From c7f56647c37e99679f804322c2147e984e3f4f7f Mon Sep 17 00:00:00 2001 From: "Xiaolin (Charlene) Zang" Date: Fri, 11 Aug 2017 13:47:59 -0400 Subject: [PATCH 001/131] When considering a vm for a job, ask aws if it is still running. --- vmms/ec2SSH.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 303bf381..bfddac86 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -20,6 +20,8 @@ from boto import ec2 from tangoObjects import TangoMachine +### added to suppress boto XML output -- Jason Boles +logging.getLogger('boto').setLevel(logging.CRITICAL) def timeout(command, time_out=1): """ timeout - Run a unix command with a timeout. Return -1 on @@ -261,6 +263,13 @@ def waitVM(self, vm, max_secs): VM is a boto.ec2.instance.Instance object. """ + self.log.info("WaitVM: %s, ec2_id: %s" % (vm.name, vm.ec2_id)) + + # test if the vm is still an instance + if not self.existsVM(vm): + self.log.info("VM %s: no longer an instance" % (vm.name)) + return -1 + # First, wait for ping to the vm instance to work instance_down = 1 instanceName = self.instanceName(vm.id, vm.name) @@ -434,7 +443,7 @@ def existsVM(self, vm): instances = self.connection.get_all_instances() for inst in instances: - if inst.instances[0].id is vm.ec2_id: + if inst.instances[0].id == vm.ec2_id and inst.instances[0].state == "running": return True return False From 4dcbbb4dfef096f3e64ef91f3eff4bf9d82b66b6 Mon Sep 17 00:00:00 2001 From: "Xiaolin (Charlene) Zang" Date: Fri, 11 Aug 2017 13:50:31 -0400 Subject: [PATCH 002/131] When number of jobs is larger than the number of available vms, N, jobManager dies after N jobs have finished in the exception handling of __manage(), if Config.REUSE_VM is set to true. This commit simply checks whether "job" is None, to avoid the crash. This allows job manager to continue and finish all jobs. But it may not be the real fix of the root problem which needs further investigation. 
--- jobManager.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/jobManager.py b/jobManager.py index 7ec31aee..a0248789 100644 --- a/jobManager.py +++ b/jobManager.py @@ -9,7 +9,7 @@ # is launched that will handle things from here on. If anything goes # wrong, the job is made dead with the error. # -import threading, logging, time, copy +import threading, logging, time, copy, os from datetime import datetime from tango import * @@ -27,10 +27,11 @@ def __init__(self, queue): self.jobQueue = queue self.preallocator = self.jobQueue.preallocator self.vmms = self.preallocator.vmms - self.log = logging.getLogger("JobManager") + self.log = logging.getLogger("JobManager-" + str(os.getpid())) # job-associated instance id self.nextId = 10000 self.running = False + self.log.info("START jobManager") def start(self): if self.running: @@ -61,14 +62,24 @@ def __manage(self): id = self.jobQueue.getNextPendingJob() if id: + self.log.info("_manage job after getNextPendingJob() %s" % id) + job = self.jobQueue.get(id) + if job is not None: + jobStr = ', '.join("%s: %s" % item for item in job.__dict__.items()) + self.log.info("_manage job %s" % jobStr) if not job.accessKey and Config.REUSE_VMS: id, vm = self.jobQueue.getNextPendingJobReuse(id) job = self.jobQueue.get(id) - + if job is not None: + jobStr = ', '.join("%s: %s" % item for item in job.__dict__.items()) + self.log.info("_manage after getNextPendingJobReuse %s" % jobStr) + else: + self.log.info("_manage after getNextPendingJobReuse %s %s" % (id, vm)) try: # Mark the job assigned self.jobQueue.assignJob(job.id) + self.log.info("_manage after assignJob %s" % id) # if the job has specified an account # create an VM on the account and run on that instance if job.accessKeyId: @@ -77,13 +88,16 @@ def __manage(self): newVM = copy.deepcopy(job.vm) newVM.id = self._getNextID() preVM = vmms.initializeVM(newVM) + self.log.info("_manage init new vm %s" % preVM.id) else: # Try to find a vm on 
the free list and allocate it to # the worker if successful. if Config.REUSE_VMS: preVM = vm + self.log.info("_manage reuse vm %s" % preVM.id) else: preVM = self.preallocator.allocVM(job.vm.name) + self.log.info("_manage allocate vm %s" % preVM.id) vmms = self.vmms[job.vm.vmms] # Create new vmms object # Now dispatch the job to a worker @@ -102,7 +116,11 @@ def __manage(self): ).start() except Exception as err: - self.jobQueue.makeDead(job.id, str(err)) + if job is not None: + # if True: + self.jobQueue.makeDead(job.id, str(err)) + else: + self.log.info("_manage: job is None") # Sleep for a bit and then check again time.sleep(Config.DISPATCH_PERIOD) From 6de4692964ecb2f5d58d9099a3f2924cbf786263 Mon Sep 17 00:00:00 2001 From: "Xiaolin (Charlene) Zang" Date: Fri, 11 Aug 2017 14:00:01 -0400 Subject: [PATCH 003/131] Add pid in the logs of the modules used by jobManager. --- jobQueue.py | 1 + preallocator.py | 19 ++++++++++++++++--- vmms/ec2SSH.py | 14 ++++++++------ worker.py | 3 ++- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index ad43e3b9..1087d937 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -215,6 +215,7 @@ def getNextPendingJobReuse(self, target_id=None): # is a free VM if (job.isNotAssigned()): vm = self.preallocator.allocVM(job.vm.name) + self.log.info("getNextPendingJobReuse alloc vm %s for %s" % (id, vm)) if vm: self.queueLock.release() return (id, vm) diff --git a/preallocator.py b/preallocator.py index 026c09f5..0b356cae 100644 --- a/preallocator.py +++ b/preallocator.py @@ -1,7 +1,7 @@ # # preallocator.py - maintains a pool of active virtual machines # -import threading, logging, time, copy +import threading, logging, time, copy, os from tangoObjects import TangoDictionary, TangoQueue, TangoIntValue from config import Config @@ -24,7 +24,7 @@ def __init__(self, vmms): self.lock = threading.Lock() self.nextID = TangoIntValue("nextID", 1000) self.vmms = vmms - self.log = logging.getLogger("Preallocator") + 
self.log = logging.getLogger("Preallocator-" + str(os.getpid())) def poolSize(self, vmName): """ poolSize - returns the size of the vmName pool, for external callers @@ -93,6 +93,7 @@ def freeVM(self, vm): self.lock.acquire() if vm and vm.id in self.machines.get(vm.name)[0]: machine = self.machines.get(vm.name) + self.log.info("freeVM: return %s to free pool" % vm.id) machine[1].put(vm) self.machines.set(vm.name, machine) else: @@ -101,6 +102,7 @@ def freeVM(self, vm): # The VM is no longer in the pool. if not_found: + self.log.info("freeVM: will destroy %s" % vm.id) vmms = self.vmms[vm.vmms] vmms.safeDestroyVM(vm) @@ -118,6 +120,7 @@ def removeVM(self, vm): """ self.lock.acquire() machine = self.machines.get(vm.name) + self.log.info("removeVM: %s" % vm.id) machine[0].remove(vm.id) self.machines.set(vm.name, machine) self.lock.release() @@ -144,6 +147,9 @@ def __create(self, vm, cnt): This function should always be called in a thread since it might take a long time to complete. """ + + result = self.getPool("default") + vmms = self.vmms[vm.vmms] self.log.debug("__create: Using VMMS %s " % (Config.VMMS_NAME)) for i in range(cnt): @@ -173,6 +179,7 @@ def __destroy(self, vm): self.lock.release() if dieVM: + self.log.info("__destroy: %s" % vm.id) self.removeVM(dieVM) vmms = self.vmms[vm.vmms] vmms.safeDestroyVM(dieVM) @@ -188,7 +195,7 @@ def createVM(self, vm): self.log.info("createVM|calling initializeVM") vmms.initializeVM(newVM) - self.log.info("createVM|done with initializeVM") + self.log.info("createVM|done with initializeVM %s" % newVM.id) self.addVM(newVM) self.freeVM(newVM) @@ -207,12 +214,15 @@ def destroyVM(self, vmName, id): dieVM = None self.lock.acquire() size = self.machines.get(vmName)[1].qsize() + self.log.info("destroyVM: free:total pool %d:%d" % (size, len(self.machines.get(vmName)[0]))) if (size == len(self.machines.get(vmName)[0])): for i in range(size): vm = self.machines.get(vmName)[1].get_nowait() if vm.id != id: + self.log.info("destroyVM: put 
to free pool id:vm.id %s:%s" % (id, vm.id)) self.machines.get(vmName)[1].put(vm) else: + self.log.info("destroyVM: will call removeVM %s" % id) dieVM = vm self.lock.release() @@ -252,4 +262,7 @@ def getPool(self, vmName): result["total"] = self.machines.get(vmName)[0] result["free"] = free_list + self.log.info("getPool: free pool %s" % ', '.join(str(x) for x in result["free"])) + self.log.info("getPool: total pool %s" % ', '.join(str(x) for x in result["total"])) + return result diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index bfddac86..ce6581f7 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -103,7 +103,7 @@ def __init__(self, accessKeyId=None, accessKey=None): else: self.connection = ec2.connect_to_region(config.Config.EC2_REGION) self.useDefaultKeyPair = True - self.log = logging.getLogger("Ec2SSH") + self.log = logging.getLogger("Ec2SSH-" + str(os.getpid())) def instanceName(self, id, name): """ instanceName - Constructs a VM instance name. Always use @@ -263,18 +263,19 @@ def waitVM(self, vm, max_secs): VM is a boto.ec2.instance.Instance object. 
""" - self.log.info("WaitVM: %s, ec2_id: %s" % (vm.name, vm.ec2_id)) + self.log.info("WaitVM: %s, ec2_id: %s" % (vm.id, vm.ec2_id)) # test if the vm is still an instance if not self.existsVM(vm): - self.log.info("VM %s: no longer an instance" % (vm.name)) - return -1 + self.log.info("VM %s: no longer an instance" % vm.id) + return -1 # First, wait for ping to the vm instance to work instance_down = 1 instanceName = self.instanceName(vm.id, vm.name) start_time = time.time() domain_name = self.domainName(vm) + self.log.info("WaitVM: pinging %s" % domain_name) while instance_down: instance_down = subprocess.call("ping -c 1 %s" % (domain_name), shell=True, @@ -287,11 +288,12 @@ def waitVM(self, vm, max_secs): time.sleep(config.Config.TIMER_POLL_INTERVAL) elapsed_secs = time.time() - start_time if (elapsed_secs > max_secs): + self.log.debug("WAITVM_TIMEOUT: %s" % vm.id) return -1 # The ping worked, so now wait for SSH to work before # declaring that the VM is ready - self.log.debug("VM %s: ping completed" % (vm.name)) + self.log.debug("VM %s: ping completed" % (vm.id)) while(True): elapsed_secs = time.time() - start_time @@ -432,7 +434,7 @@ def getVMs(self): vm.ec2_id = inst.id vm.name = str(inst.tags.get('Name')) self.log.debug('getVMs: Instance - %s, EC2 Id - %s' % - (vm.name, vm.ec2_id)) + (vm.id, vm.ec2_id)) vms.append(vm) return vms diff --git a/worker.py b/worker.py index e7ffec25..3c9af433 100644 --- a/worker.py +++ b/worker.py @@ -35,7 +35,7 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM): self.preallocator = preallocator self.preVM = preVM threading.Thread.__init__(self) - self.log = logging.getLogger("Worker") + self.log = logging.getLogger("Worker-" + str(os.getpid())) # # Worker helper functions @@ -61,6 +61,7 @@ def detachVM(self, return_vm=False, replace_vm=False): # replacement has been created. Otherwise there is a # potential race where the job manager thinks that the # pool is empty and creates a spurious vm. 
+ self.log.info("removeVM %s" % self.job.vm.id); self.preallocator.removeVM(self.job.vm) def rescheduleJob(self, hdrfile, ret, err): From e2afe8a7d73bbd633282a35ec71ea690d2bb1db0 Mon Sep 17 00:00:00 2001 From: "Xiaolin (Charlene) Zang" Date: Fri, 11 Aug 2017 14:02:10 -0400 Subject: [PATCH 004/131] When job manager restarts, it empties its vm "total" pool and "free" queue. Due to misunderstanding of redis, the free queue is not actually emptied, resulting in vms staying in free queue but not in total pool. --- jobManager.py | 8 +++++++- preallocator.py | 5 +++-- requirements.txt | 1 + tangoObjects.py | 16 +++++++++++++++- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/jobManager.py b/jobManager.py index a0248789..40412898 100644 --- a/jobManager.py +++ b/jobManager.py @@ -117,7 +117,6 @@ def __manage(self): except Exception as err: if job is not None: - # if True: self.jobQueue.makeDead(job.id, str(err)) else: self.log.info("_manage: job is None") @@ -137,6 +136,13 @@ def __manage(self): tango.resetTango(tango.preallocator.vmms) for key in tango.preallocator.machines.keys(): tango.preallocator.machines.set(key, [[], TangoQueue(key)]) + + # The above call sets the total pool empty. But the free pool which + # is a queue in redis, may not be empty. When the job manager restarts, + # resetting the free queue using the key doesn't change its content. + # Therefore we empty the queue, thus the free pool, to keep it consistent + # with the total pool. 
+ tango.preallocator.machines.get(key)[1].make_empty() jobs = JobManager(tango.jobQueue) print("Starting the stand-alone Tango JobManager") diff --git a/preallocator.py b/preallocator.py index 0b356cae..e2665238 100644 --- a/preallocator.py +++ b/preallocator.py @@ -46,6 +46,8 @@ def update(self, vm, num): self.lock.acquire() if vm.name not in self.machines.keys(): self.machines.set(vm.name, [[], TangoQueue(vm.name)]) + # see comments in jobManager.py for the same call + self.machines.get(vm.name)[1].make_empty() self.log.debug("Creating empty pool of %s instances" % (vm.name)) self.lock.release() @@ -112,6 +114,7 @@ def addVM(self, vm): self.lock.acquire() machine = self.machines.get(vm.name) machine[0].append(vm.id) + self.log.info("addVM: add %s" % vm.id) self.machines.set(vm.name, machine) self.lock.release() @@ -148,8 +151,6 @@ def __create(self, vm, cnt): might take a long time to complete. """ - result = self.getPool("default") - vmms = self.vmms[vm.vmms] self.log.debug("__create: Using VMMS %s " % (Config.VMMS_NAME)) for i in range(cnt): diff --git a/requirements.txt b/requirements.txt index e5a20156..28d7484f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ backports.ssl-match-hostname==3.4.0.2 +boto3 boto==2.27.0 futures==2.2.0 plumbum==1.4.2 diff --git a/tangoObjects.py b/tangoObjects.py index 17e4130f..b82a99d0 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -59,7 +59,7 @@ def __init__(self, name="DefaultTestVM", image=None, vmms=None, self.instance_id = id def __repr__(self): - return "TangoMachine(image: %s, vmms: %s)" % (self.image, self.vmms) + return "TangoMachine(image: %s, vmms: %s, id: %s)" % (self.image, self.vmms, self.id) class TangoJob(): @@ -210,6 +210,14 @@ def __init__(self, name, namespace="queue"): self.__db = getRedisConnection() self.key = '%s:%s' % (namespace, name) + # for debugging. 
return a readable string representation + def dump(self): + unpickled_obj = self.__db.lrange(self.key, 0, -1) + objs = [] + for obj in unpickled_obj: + objs.append(pickle.loads(obj)) + return objs + def qsize(self): """Return the approximate size of the queue.""" return self.__db.llen(self.key) @@ -239,6 +247,12 @@ def get(self, block=True, timeout=None): item = pickle.loads(item) return item + def make_empty(self): + while True: + item = self.__db.lpop(self.key) + if item is None: + break + def get_nowait(self): """Equivalent to get(False).""" return self.get(False) From 97c22e39bcadf37b784cc2a0db5ea6202a5634ab Mon Sep 17 00:00:00 2001 From: "Xiaolin (Charlene) Zang" Date: Fri, 11 Aug 2017 14:03:52 -0400 Subject: [PATCH 005/131] Add ability to pull amis from aws and to exact a tag "Name" as the image name for each ami. The lab author specifies the desired image using this name. --- vmms/ec2SSH.py | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index ce6581f7..8b6db1d7 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -18,6 +18,8 @@ import boto from boto import ec2 +import boto3 + from tangoObjects import TangoMachine ### added to suppress boto XML output -- Jason Boles @@ -105,6 +107,37 @@ def __init__(self, accessKeyId=None, accessKey=None): self.useDefaultKeyPair = True self.log = logging.getLogger("Ec2SSH-" + str(os.getpid())) + # Use boto3 to read images. Find the "Name" tag and use it as key to + # build a map from "Name tag" to boto3's image structure. + # The code is currently using boto 2 for most of the work and we don't + # have the energy to upgrade it yet. So boto and boto3 are used together. 
+ + client = boto3.client("ec2", config.Config.EC2_REGION) + images = client.describe_images(Owners=["self"])["Images"] + self.img2ami = {} + for image in images: + if "Tags" not in image: + continue + tags = image["Tags"] + for tag in tags: + if "Key" in tag and tag["Key"] == "Name": + if not (tag["Value"] and tag["Value"].endswith(".img")): + self.log.info("Ignore %s for ill-formed name tag %s" % + (image["ImageId"], tag["Value"])) + continue + if tag["Value"] in self.img2ami: + self.log.info("Ignore %s for duplicate name tag %s" % + (image["ImageId"], tag["Value"])) + continue + + self.img2ami[tag["Value"]] = image + self.log.info("Found image: %s %s %s" % (tag["Value"], image["ImageId"], image["Name"])) + + imageAmis = [item["ImageId"] for item in images] + taggedAmis = [self.img2ami[key]["ImageId"] for key in self.img2ami] + ignoredAmis = list(set(imageAmis) - set(taggedAmis)) + self.log.info("Ignored amis %s due to lack of proper name tag" % str(ignoredAmis)) + def instanceName(self, id, name): """ instanceName - Constructs a VM instance name. Always use this function when you need a VM instance name. Never generate @@ -151,7 +184,8 @@ def tangoMachineToEC2Instance(self, vm): else: ec2instance['instance_type'] = config.Config.DEFAULT_INST_TYPE - ec2instance['ami'] = config.Config.DEFAULT_AMI + ec2instance['ami'] = self.img2ami[vm.name + ".img"]["ImageId"] + self.log.info("tangoMachineToEC2Instance: %s" % str(ec2instance)) return ec2instance @@ -452,4 +486,4 @@ def existsVM(self, vm): def getImages(self): """ getImages - return a constant; actually use the ami specified in config """ - return ["default.img"] + return list(self.img2ami.keys()) From e66551a53223b31c3baef74860eb845e4c2adac1 Mon Sep 17 00:00:00 2001 From: "Xiaolin (Charlene) Zang" Date: Fri, 11 Aug 2017 15:27:35 -0400 Subject: [PATCH 006/131] Remove DEFAULT_AMI since amis are automatically loaded from aws now. 
--- config.template.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/config.template.py b/config.template.py index c7a92007..a1f1b902 100644 --- a/config.template.py +++ b/config.template.py @@ -134,9 +134,20 @@ class Config: ###### # Part 5: EC2 Constants # + + # Special instructions to admin: Tango finds usable images from aws + # in the following fashion: + # It examines every ami (Amazon Image) owned by the EC2_USER_NAME, + # looks for a tag with the key "Name" (case sensitive), and use the value + # of the tag as the image name for the ami, for example, ubuntu.img or + # myImage.img. If an ami doesn't have such tag, it is ignored (watch + # for a log message). + # + # The lab author, when specifying an image to use, should specify one + # of those image names available. + EC2_REGION = '' EC2_USER_NAME = '' - DEFAULT_AMI = '' DEFAULT_INST_TYPE = '' DEFAULT_SECURITY_GROUP = '' SECURITY_KEY_PATH = '' From 94656b76005f9e9d71322d3454cdda5d959f4b75 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 15 Aug 2017 20:26:37 +0000 Subject: [PATCH 007/131] Add script to drive lab submissions into tango. --- tools/run_jobs.py | 99 ++++++++++++++++++++++++++++++++++++++++++++ tools/util.py | 103 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 tools/run_jobs.py create mode 100644 tools/util.py diff --git a/tools/run_jobs.py b/tools/run_jobs.py new file mode 100644 index 00000000..65ee182d --- /dev/null +++ b/tools/run_jobs.py @@ -0,0 +1,99 @@ +import os, re, glob, datetime, time + +from util import Config +from util import Cmd +from util import CommandLine +from util import Lab + +# drive student submissions to Tango. See ./util.py for preset configuratons. +# the script finds course and labs at a specified location and submits work +# from the handin directory. +# It then waits for all output files to have newer modification time. 
+ +cfg = Config() +cmdLine = CommandLine(cfg) +cmd = Cmd(cfg) + +startTime = time.mktime(datetime.datetime.now().timetuple()) +outputFiles = [] + +# if either is None, then all student works are submitted. +firstStudentNum = 1 +totalStudents = 2 + +for labIndex in cmdLine.args.indecies: + if labIndex >= len(cfg.labs): + print("lab index %d is out of range" % labIndex) + exit(-1) + +for labIndex in cmdLine.args.indecies: + lab = Lab(cfg, labIndex) + cmd.info() + cmd.open(lab) + + students = [] + student2fileFullPath = {} + student2file = {} + + # get student handin files, the last submission for each student, + # and make a map from email to useful attrbutes + + for file in sorted(glob.glob(lab.handinFilesQuery)): + baseName = file.split("/").pop() + matchObj = re.match(r'(.*)_[0-9]+_(.*)', baseName, re.M|re.I) + email = matchObj.group(1) + + withoutSuffix = baseName.replace(lab.handinSuffix, "") + outputFile = withoutSuffix + "_" + lab.name + ".txt" + jobName = lab.courseLab + "_" + withoutSuffix + + if email not in students: + students.append(email) + studentFile = {"full": file, "base": baseName, "job": jobName, + "stripped": matchObj.group(2), "output": outputFile} + student2file[email] = studentFile + + # submit all student works or a given range + if not (firstStudentNum and totalStudents): + firstStudentNum = 0 + totalStudents = len(students) + + print ("# Found %d students for lab %s" % (len(students), lab.name)) + print ("# Students index range %d..%d" % (firstStudentNum, totalStudents)) + + # load lab files + cmd.upload(lab, lab.makefile) + cmd.upload(lab, lab.autogradeTar) + + # load and run student submission + for i in range (firstStudentNum, firstStudentNum + totalStudents): + print ("\n# Submit for %s @ %s" % (students[i], lab.name)) + cmd.upload(lab, student2file[students[i]]["full"]) + cmd.addJob(lab, student2file[students[i]]) + outputFiles.append(lab.outputDir + "/" + student2file[students[i]]["output"]) +# end of main loop "cmdLine.args.indecies" 
+ +print "\nNow waiting for output files..." +remainingFiles = list(outputFiles) +numberRemaining = len(remainingFiles) +loopDelay = 5 + +while True: + time.sleep(loopDelay) + + finishedFiles = [] + for file in remainingFiles: + if os.path.getmtime(file) > startTime: + print("Output %s is ready" % file) + finishedFiles.append(file) + + remainingFiles = set(remainingFiles) - set(finishedFiles) + nFinished = numberRemaining - len(remainingFiles) + print("%d jobs finished in the last %d seconds" % (nFinished, loopDelay)) + now = time.mktime(datetime.datetime.now().timetuple()) + print("%s has passed\n" % (str(datetime.timedelta(seconds = now - startTime)))) + + numberRemaining = len(remainingFiles) + if numberRemaining == 0: + print "All output files are counted for :))" + break diff --git a/tools/util.py b/tools/util.py new file mode 100644 index 00000000..71870e1a --- /dev/null +++ b/tools/util.py @@ -0,0 +1,103 @@ +import subprocess, os, argparse + +class Config: + tangoDir = "/root/autolab-oneclick/server/Tango" + cliCmd = "python " + tangoDir + "/clients/tango-cli.py" + tangoPort = "8600" + tangoIP = "" + # output dir used by Tango for submissions + tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" + + # course definition and handin files location + course = "czang-exp" + courseRoot = "/n/scratch/czang/f16/" + labs = [ + {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, + {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] + +class CommandLine(): + def printLabs(self, name=None): + print ("available tests:") + print ("index\ttest") + i = 0 + for lab in self.cfg.labs: + print ("%d\t%s" % (i, lab["name"])) + i += 1 + print + + def __init__(self, cfg): + self.cfg = cfg + parser = argparse.ArgumentParser(description='Drive jobs to Tango', + usage=self.printLabs()) + parser.add_argument('indecies', metavar='index', type=int, nargs='+', + help="index of a test") + self.args = parser.parse_args() + 
+# represent attributes associated to a given lab +class Lab: + def __init__(self, cfg, labIndex): + self.cfg = cfg + self.name = cfg.labs[labIndex]["name"] + self.handinSuffix = cfg.labs[labIndex]["handinSuffix"] + self.image = cfg.labs[labIndex]["image"] + self.courseLab = cfg.course + "." + self.name + self.courseLabDir = cfg.courseRoot + "/" + self.name + self.makefile = self.courseLabDir + "/" + "autograde-Makefile" + self.autogradeTar = self.courseLabDir + "/" + "autograde.tar" + self.handinFilesQuery = "/".join([self.courseLabDir, + "handin", + "*" + self.handinSuffix]) + self.outputDir = None + if cfg.tangoFileRoot: + self.outputDir = "/".join([cfg.tangoFileRoot, + "test-" + self.courseLab, + "output"]) + +class Cmd: + def __init__(self, cfg): + self.cfg = cfg + outBytes = subprocess.check_output(["ps", "-auxw"]) + for line in outBytes.decode("utf-8").split("\n"): + if cfg.tangoPort in line: + argList = line.split() + for index, token in enumerate(argList): + if token == "-container-ip": + cfg.tangoIP = argList[index + 1] + if cfg.tangoIP == "": + print "ERROR: Cannot find tango server IP" + exit(-1) + + self.basic = cfg.cliCmd + self.basic += " -s " + cfg.tangoIP + " -P " + cfg.tangoPort + " -k test" + + print "CMD BASE:", self.basic + #end of __init__ + + def run(self, cmd): # an internal util function + print "EXEC tango-cli", cmd + os.system(self.basic + cmd) + print "=======================================" + + def info(self): + self.run(" --info") + + def open(self, lab): + self.run(" --open -l " + lab.courseLab) + + def upload(self, lab, file): + self.run(" --upload --filename " + file + " -l " + lab.courseLab) + + def addJob(self, lab, studentFile): + myCmd = " --addJob --image " + lab.image + " -l " + lab.courseLab + myCmd += " --jobname job_" + studentFile["job"] + myCmd += " --outputFile " + studentFile["output"] + myCmd += " --infiles" + myCmd += " '{\"localFile\": \"%s\", \"destFile\": \"%s\"}' " % \ + (studentFile["base"], 
studentFile["stripped"]) + myCmd += " '{\"localFile\": \"autograde-Makefile\", \"destFile\": \"Makefile\"}' " + myCmd += " '{\"localFile\": \"autograde.tar\", \"destFile\": \"autograde.tar\"}' " + self.run(myCmd) + + def poll(self, lab, studentFile): + myCmd = " --poll -l " + lab.courseLab + self.run(myCmd + " --outputFile " + studentFile["output"]) From 253cb8ea57e8f5f1c6ab5372d4ff730b2a78d2e1 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 15 Aug 2017 20:41:21 +0000 Subject: [PATCH 008/131] Add scripts to access ec2 and redis. --- tools/ec2Read.py | 14 ++++++++++++++ tools/redisRead.py | 17 +++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 tools/ec2Read.py create mode 100644 tools/redisRead.py diff --git a/tools/ec2Read.py b/tools/ec2Read.py new file mode 100644 index 00000000..4067c1dd --- /dev/null +++ b/tools/ec2Read.py @@ -0,0 +1,14 @@ +import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from vmms.ec2SSH import Ec2SSH + +# test vmms.ec2SSH's image extraction code +# also serve as a template of accessing the ec2SSH vmms + +vmms = Ec2SSH() +for key in vmms.img2ami: + image = vmms.img2ami[key] + print image["Name"], image["ImageId"], key + + + diff --git a/tools/redisRead.py b/tools/redisRead.py new file mode 100644 index 00000000..943a553a --- /dev/null +++ b/tools/redisRead.py @@ -0,0 +1,17 @@ +import sys, os +# search parent dirs for importable packages +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from tangoObjects import TangoDictionary + +# list all "machines" pools and get the total and free sets for each pool +# it also serves as a template of extracting contents from redis + +machines = TangoDictionary("machines") +print "pools", machines.keys() + +for poolName in machines.keys(): + print "pool:", poolName + print "total:", machines.get(poolName)[0] + print "free:", machines.get(poolName)[1].qsize(), machines.get(poolName)[1].dump() + + From 
45d5fc138e410ccb7b9b53b58b5e97d65e12725b Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 15 Aug 2017 20:57:55 +0000 Subject: [PATCH 009/131] remove trailing lines. --- tools/ec2Read.py | 3 --- tools/redisRead.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 4067c1dd..85a28ba0 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -9,6 +9,3 @@ for key in vmms.img2ami: image = vmms.img2ami[key] print image["Name"], image["ImageId"], key - - - diff --git a/tools/redisRead.py b/tools/redisRead.py index 943a553a..2a0b37e6 100644 --- a/tools/redisRead.py +++ b/tools/redisRead.py @@ -13,5 +13,3 @@ print "pool:", poolName print "total:", machines.get(poolName)[0] print "free:", machines.get(poolName)[1].qsize(), machines.get(poolName)[1].dump() - - From 12139e18158cab29d3ecc00c6082953cc772deed Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 16 Aug 2017 15:28:41 +0000 Subject: [PATCH 010/131] Check if output file exists before comparing modification time. --- tools/run_jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index 65ee182d..beb103b5 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -19,7 +19,7 @@ # if either is None, then all student works are submitted. firstStudentNum = 1 -totalStudents = 2 +totalStudents = 7 for labIndex in cmdLine.args.indecies: if labIndex >= len(cfg.labs): @@ -83,7 +83,7 @@ finishedFiles = [] for file in remainingFiles: - if os.path.getmtime(file) > startTime: + if os.path.exists(file) and os.path.getmtime(file) > startTime: print("Output %s is ready" % file) finishedFiles.append(file) From d8d0f656fb8cf10a6ca46900f812659026c2d2d8 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 18 Aug 2017 15:06:27 +0000 Subject: [PATCH 011/131] resetTango should only be called from jobManager. Remove the call from restful server. 
--- restful-tango/server.py | 1 - tango.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/restful-tango/server.py b/restful-tango/server.py index ef7ff653..bb1068f5 100755 --- a/restful-tango/server.py +++ b/restful-tango/server.py @@ -143,6 +143,5 @@ def post(self, key, image, num): if len(sys.argv) > 1: port = int(sys.argv[1]) - tangoREST.tango.resetTango(tangoREST.tango.preallocator.vmms) application.listen(port, max_buffer_size=Config.MAX_INPUT_FILE_SIZE) tornado.ioloop.IOLoop.instance().start() diff --git a/tango.py b/tango.py index 058c9930..ef153cc2 100755 --- a/tango.py +++ b/tango.py @@ -207,6 +207,9 @@ def getInfo(self): # # Helper functions # + + # NOTE: This function should be called by ONLY jobManager. The rest servers + # shouldn't call this function. def resetTango(self, vmms): """ resetTango - resets Tango to a clean predictable state and ensures that it has a working virtualization environment. A side From 9043e3a0851df6576712f30039cce33a5b2caca2 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 18 Aug 2017 15:10:12 +0000 Subject: [PATCH 012/131] Add and subtract some logging. 
--- jobManager.py | 4 ++-- jobQueue.py | 2 +- vmms/ec2SSH.py | 9 +++++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/jobManager.py b/jobManager.py index 40412898..968837c0 100644 --- a/jobManager.py +++ b/jobManager.py @@ -62,12 +62,12 @@ def __manage(self): id = self.jobQueue.getNextPendingJob() if id: - self.log.info("_manage job after getNextPendingJob() %s" % id) + self.log.info("_manage: next job id %s" % id) job = self.jobQueue.get(id) if job is not None: jobStr = ', '.join("%s: %s" % item for item in job.__dict__.items()) - self.log.info("_manage job %s" % jobStr) + # self.log.info("_manage job %s" % jobStr) if not job.accessKey and Config.REUSE_VMS: id, vm = self.jobQueue.getNextPendingJobReuse(id) job = self.jobQueue.get(id) diff --git a/jobQueue.py b/jobQueue.py index 1087d937..3b53c1cd 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -257,7 +257,7 @@ def unassignJob(self, jobId): def makeDead(self, id, reason): """ makeDead - move a job from live queue to dead queue """ - self.log.info("makeDead| Making dead job ID: " + str(id)) + self.log.info("makeDead| Making dead job ID: " + str(id) + " " + reason) self.queueLock.acquire() self.log.debug("makeDead| Acquired lock to job queue.") status = -1 diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 8b6db1d7..bd4189ca 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -24,6 +24,8 @@ ### added to suppress boto XML output -- Jason Boles logging.getLogger('boto').setLevel(logging.CRITICAL) +logging.getLogger('boto3').setLevel(logging.CRITICAL) +logging.getLogger('botocore').setLevel(logging.CRITICAL) def timeout(command, time_out=1): """ timeout - Run a unix command with a timeout. 
Return -1 on @@ -136,7 +138,8 @@ def __init__(self, accessKeyId=None, accessKey=None): imageAmis = [item["ImageId"] for item in images] taggedAmis = [self.img2ami[key]["ImageId"] for key in self.img2ami] ignoredAmis = list(set(imageAmis) - set(taggedAmis)) - self.log.info("Ignored amis %s due to lack of proper name tag" % str(ignoredAmis)) + if (len(ignoredAmis) > 0): + self.log.info("Ignored amis %s due to lack of proper name tag" % str(ignoredAmis)) def instanceName(self, id, name): """ instanceName - Constructs a VM instance name. Always use @@ -231,6 +234,7 @@ def initializeVM(self, vm): try: instanceName = self.instanceName(vm.id, vm.name) ec2instance = self.tangoMachineToEC2Instance(vm) + self.log.info("initiliazeVM: %s %s" % (instanceName, str(ec2instance))) # ensure that security group exists self.createSecurityGroup() if self.useDefaultKeyPair: @@ -468,7 +472,7 @@ def getVMs(self): vm.ec2_id = inst.id vm.name = str(inst.tags.get('Name')) self.log.debug('getVMs: Instance - %s, EC2 Id - %s' % - (vm.id, vm.ec2_id)) + (vm.name, vm.ec2_id)) vms.append(vm) return vms @@ -486,4 +490,5 @@ def existsVM(self, vm): def getImages(self): """ getImages - return a constant; actually use the ami specified in config """ + self.log.info("getImages: %s" % str(list(self.img2ami.keys()))) return list(self.img2ami.keys()) From 4a6bba9563ed988c8d92b4896538a8212b343441 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 18 Aug 2017 21:24:40 +0000 Subject: [PATCH 013/131] Fix a condition for running all students' jobs. --- tools/run_jobs.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index beb103b5..16edda96 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -18,8 +18,8 @@ outputFiles = [] # if either is None, then all student works are submitted. 
-firstStudentNum = 1 -totalStudents = 7 +firstStudentNum = 0 +totalStudents = 1 for labIndex in cmdLine.args.indecies: if labIndex >= len(cfg.labs): @@ -53,8 +53,16 @@ "stripped": matchObj.group(2), "output": outputFile} student2file[email] = studentFile + # print the students and the indices + if False: + i = 0 + for student in students: + print i, student + i += 1 + exit() + # submit all student works or a given range - if not (firstStudentNum and totalStudents): + if firstStudentNum is None or totalStudents is None: firstStudentNum = 0 totalStudents = len(students) @@ -90,6 +98,7 @@ remainingFiles = set(remainingFiles) - set(finishedFiles) nFinished = numberRemaining - len(remainingFiles) print("%d jobs finished in the last %d seconds" % (nFinished, loopDelay)) + print("%d unfinished out of %d" % (len(remainingFiles), len(outputFiles))) now = time.mktime(datetime.datetime.now().timetuple()) print("%s has passed\n" % (str(datetime.timedelta(seconds = now - startTime)))) From b92ffbaa04bfe7c383a01b3adf0daef2202a0578 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 22 Aug 2017 21:56:27 +0000 Subject: [PATCH 014/131] Use tangoHostPort to distinguish multiple tango containers on the same host. 
--- tools/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/util.py b/tools/util.py index 71870e1a..525a1b43 100644 --- a/tools/util.py +++ b/tools/util.py @@ -3,7 +3,7 @@ class Config: tangoDir = "/root/autolab-oneclick/server/Tango" cliCmd = "python " + tangoDir + "/clients/tango-cli.py" - tangoPort = "8600" + tangoHostPort = "host-port 8600" tangoIP = "" # output dir used by Tango for submissions tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" @@ -58,7 +58,7 @@ def __init__(self, cfg): self.cfg = cfg outBytes = subprocess.check_output(["ps", "-auxw"]) for line in outBytes.decode("utf-8").split("\n"): - if cfg.tangoPort in line: + if cfg.tangoHostPort in line: argList = line.split() for index, token in enumerate(argList): if token == "-container-ip": @@ -68,7 +68,7 @@ def __init__(self, cfg): exit(-1) self.basic = cfg.cliCmd - self.basic += " -s " + cfg.tangoIP + " -P " + cfg.tangoPort + " -k test" + self.basic += " -s " + cfg.tangoIP + " -P 8600" + " -k test" print "CMD BASE:", self.basic #end of __init__ From 75ca36d4e10326592280246e5ef0047e20d077de Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 22 Aug 2017 22:00:51 +0000 Subject: [PATCH 015/131] Add logging in destroyVM. --- preallocator.py | 3 +++ vmms/ec2SSH.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/preallocator.py b/preallocator.py index e2665238..0f3270ba 100644 --- a/preallocator.py +++ b/preallocator.py @@ -216,6 +216,9 @@ def destroyVM(self, vmName, id): self.lock.acquire() size = self.machines.get(vmName)[1].qsize() self.log.info("destroyVM: free:total pool %d:%d" % (size, len(self.machines.get(vmName)[0]))) + # xxxXXX??? the following code is questionable: It requires that + # all vms are free. Otherwise it doesn't do anything. Is this desired + # behavior? 
if (size == len(self.machines.get(vmName)[0])): for i in range(size): vm = self.machines.get(vmName)[1].get_nowait() diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index bd4189ca..ec192d27 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -444,6 +444,8 @@ def copyOut(self, vm, destFile): def destroyVM(self, vm): """ destroyVM - Removes a VM from the system """ + + self.log.info("destroyVM: %s %s %s" % (vm.ec2_id, vm.name, vm.id)) ret = self.connection.terminate_instances(instance_ids=[vm.ec2_id]) # delete dynamically created key if not self.useDefaultKeyPair: From 93e60ada803514d4164237f5043bee95671259aa Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 23 Aug 2017 20:10:56 +0000 Subject: [PATCH 016/131] Modified pool allocation logic to 1) not to allocate all vms allowed by pool size at once and 2) consider vms in free pool first. --- config.template.py | 5 +++++ jobManager.py | 4 +++- jobQueue.py | 11 +++++++---- preallocator.py | 24 ++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/config.template.py b/config.template.py index a1f1b902..17bbf6fd 100644 --- a/config.template.py +++ b/config.template.py @@ -57,6 +57,8 @@ class Config: NUM_THREADS = 20 # We have the option to reuse VMs or discard them after each use + # xxxXXX??? strongly suspect the code path for the False case + # not working, after a failed experiment. REUSE_VMS = True # Worker waits this many seconds for functions waitvm, copyin (per @@ -106,6 +108,9 @@ class Config: # Default vm pool size POOL_SIZE = 2 + # Default increment step when enlarging vm pool + POOL_ALLOC_INCREMENT = 2 + # Optionally log finer-grained timing information LOG_TIMING = False diff --git a/jobManager.py b/jobManager.py index 968837c0..0f6c11d1 100644 --- a/jobManager.py +++ b/jobManager.py @@ -94,8 +94,10 @@ def __manage(self): # the worker if successful. 
if Config.REUSE_VMS: preVM = vm - self.log.info("_manage reuse vm %s" % preVM.id) + self.log.info("_manage use vm %s" % preVM.id) else: + # xxxXXX??? strongly suspect this code path not work. + # After setting REUSE_VMS to False, job submissions don't run. preVM = self.preallocator.allocVM(job.vm.name) self.log.info("_manage allocate vm %s" % preVM.id) vmms = self.vmms[job.vm.vmms] # Create new vmms object diff --git a/jobQueue.py b/jobQueue.py index 3b53c1cd..9128ae86 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -207,16 +207,19 @@ def getNextPendingJobReuse(self, target_id=None): # if target_id is set, only interested in this id if target_id and target_id != id: continue - # Create a pool if necessary - if self.preallocator.poolSize(job.vm.name) == 0: - self.preallocator.update(job.vm, Config.POOL_SIZE) + + # Create or enlarge a pool if there is no free vm to use and + # the limit for pool is not reached yet + if self.preallocator.freePoolSize(job.vm.name) == 0 and \ + self.preallocator.poolSize(job.vm.name) < Config.POOL_SIZE: + self.preallocator.incrementPoolSize(job.vm, Config.POOL_ALLOC_INCREMENT) # If the job hasn't been assigned to a worker yet, see if there # is a free VM if (job.isNotAssigned()): vm = self.preallocator.allocVM(job.vm.name) - self.log.info("getNextPendingJobReuse alloc vm %s for %s" % (id, vm)) if vm: + self.log.info("getNextPendingJobReuse alloc vm %s to job %s" % (vm, id)) self.queueLock.release() return (id, vm) diff --git a/preallocator.py b/preallocator.py index 0f3270ba..f4421a7b 100644 --- a/preallocator.py +++ b/preallocator.py @@ -34,6 +34,30 @@ def poolSize(self, vmName): else: return len(self.machines.get(vmName)[0]) + def freePoolSize(self, vmName): + """ freePoolSize - returns the size of the vmName free pool, for external callers + """ + if vmName in self.machines.keys(): + return self.machines.get(vmName)[1].qsize() + else: + return 0 + + def incrementPoolSize(self, vm, delta): + """ + Called by jobQueue to create the pool 
and allcoate given number of vms + """ + + self.lock.acquire() + if vm.name not in self.machines.keys(): + self.machines.set(vm.name, [[], TangoQueue(vm.name)]) + # see comments in jobManager.py for the same call + self.machines.get(vm.name)[1].make_empty() + self.log.debug("Creating empty pool of %s instances" % (vm.name)) + self.lock.release() + + self.log.debug("incrementPoolSize: add %d new %s instances" % (delta, vm.name)) + threading.Thread(target=self.__create(vm, delta)).start() + def update(self, vm, num): """ update - Updates the number of machines of a certain type to be preallocated. From 76157c3e82b0919ad0dbf6b68ea811989d335fc1 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 30 Aug 2017 15:17:22 +0000 Subject: [PATCH 017/131] When job manager restarts, it now destroys the vm instances that are not in the free pools, instead of destroying all vms. --- jobManager.py | 12 +----------- tango.py | 54 +++++++++++++++++++++++++++++++++++++++++++++++--------- vmms/ec2SSH.py | 30 ++++++++++++++++++++++-------- 3 files changed, 68 insertions(+), 28 deletions(-) diff --git a/jobManager.py b/jobManager.py index 0f6c11d1..d96e5b97 100644 --- a/jobManager.py +++ b/jobManager.py @@ -136,16 +136,6 @@ def __manage(self): tango = TangoServer() tango.log.debug("Resetting Tango VMs") tango.resetTango(tango.preallocator.vmms) - for key in tango.preallocator.machines.keys(): - tango.preallocator.machines.set(key, [[], TangoQueue(key)]) - - # The above call sets the total pool empty. But the free pool which - # is a queue in redis, may not be empty. When the job manager restarts, - # resetting the free queue using the key doesn't change its content. - # Therefore we empty the queue, thus the free pool, to keep it consistent - # with the total pool.
- tango.preallocator.machines.get(key)[1].make_empty() jobs = JobManager(tango.jobQueue) - - print("Starting the stand-alone Tango JobManager") + tango.log.info("Starting the stand-alone Tango JobManager") jobs.run() diff --git a/tango.py b/tango.py index ef153cc2..02c06869 100755 --- a/tango.py +++ b/tango.py @@ -216,25 +216,61 @@ def resetTango(self, vmms): effect is that also checks that each supported VMMS is actually running. """ + + # There are two cases this function is called: 1. Tango has a fresh start. + # Then we want to destroy all instances in Tango's name space. 2. Job + # Manager is restarted after a previous crash. Then we want to destroy + # the "busy" instances prior to the crash and leave the "free" onces intact. + self.log.debug("Received resetTango request.") try: - # For each supported VMM system, get the instances it knows about, - # and kill those in the current Tango name space. + # For each supported VMM system, get the instances it knows about + # in the current Tango name space and kill those not in free pools. for vmms_name in vmms: vobj = vmms[vmms_name] + + # Round up all instances in the free pools. + allFreeVMs = [] + for key in self.preallocator.machines.keys(): + freePool = self.preallocator.getPool(key)["free"] + for vmId in freePool: + vmName = vobj.instanceName(vmId, key) + allFreeVMs.append(vmName) + self.log.info("vms in all free pools: %s" % allFreeVMs) + + # For each in Tango's name space, destroy the onces in free pool. + # AND remove it from Tango's internal bookkeeping. vms = vobj.getVMs() self.log.debug("Pre-existing VMs: %s" % [vm.name for vm in vms]) - namelist = [] + destroyedList = [] + removedList = [] for vm in vms: if re.match("%s-" % Config.PREFIX, vm.name): - vobj.destroyVM(vm) - # Need a consistent abstraction for a vm between - # interfaces - namelist.append(vm.name) - if namelist: + + # Todo: should have an one-call interface to destroy the + # machine AND to keep the interval data consistent. 
+ if vm.name not in allFreeVMs: + destroyedList.append(vm.name) + vobj.destroyVM(vm) + + # also remove it from "total" set of the pool + (prefix, vmId, poolName) = vm.name.split("-") + machine = self.preallocator.machines.get(poolName) + if not machine: # the pool may not exist + continue + + if int(vmId) in machine[0]: + removedList.append(vm.name) + machine[0].remove(int(vmId)) + self.preallocator.machines.set(poolName, machine) + + if destroyedList: self.log.warning("Killed these %s VMs on restart: %s" % - (vmms_name, namelist)) + (vmms_name, destroyedList)) + if removedList: + self.log.warning("Removed these %s VMs from their pools" % + (removedList)) for _, job in self.jobQueue.liveJobs.iteritems(): if not job.isNotAssigned(): diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index ec192d27..bf97fb15 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -222,6 +222,12 @@ def createSecurityGroup(self): except boto.exception.EC2ResponseError: pass + def getInstanceByReservationId(self, reservationId): + for inst in self.connection.get_all_instances(): + if inst.id == reservationId: + return inst.instances.pop() + return None + # # VMMS API functions # @@ -252,14 +258,25 @@ def initializeVM(self, vm): config.Config.DEFAULT_SECURITY_GROUP], instance_type=ec2instance['instance_type']) + newInstance = self.getInstanceByReservationId(reservation.id) + if newInstance: + # Assign name to EC2 instance + self.connection.create_tags([newInstance.id], {"Name": instanceName}) + self.log.info("new instance created %s" % newInstance) + else: + self.log.info("failed to find new instance for %s" % instanceName) + # Todo: should throw exception, etc. But without full understanding + # of the overall code structure, don't do anything for now. XXXxxx??? 
+ return vm + # Wait for instance to reach 'running' state state = -1 start_time = time.time() while state is not config.Config.INSTANCE_RUNNING: - - for inst in self.connection.get_all_instances(): - if inst.id == reservation.id: - newInstance = inst.instances.pop() + newInstance = self.getInstanceByReservationId(reservation.id) + if not newInstance: # XXXxxx??? again, need error handling + self.log.info("failed to obtain status for %s" % instanceName) + return vm state = newInstance.state_code self.log.debug( @@ -283,9 +300,6 @@ def initializeVM(self, vm): # Save domain and id ssigned by EC2 in vm object vm.domain_name = newInstance.ip_address vm.ec2_id = newInstance.id - # Assign name to EC2 instance - self.connection.create_tags( - [newInstance.id], {"Name": instanceName}) self.log.debug("VM %s: %s" % (instanceName, newInstance)) return vm @@ -445,7 +459,7 @@ def destroyVM(self, vm): """ destroyVM - Removes a VM from the system """ - self.log.info("destroyVM: %s %s %s" % (vm.ec2_id, vm.name, vm.id)) + self.log.info("destroyVM: %s %s" % (vm.ec2_id, vm.name)) ret = self.connection.terminate_instances(instance_ids=[vm.ec2_id]) # delete dynamically created key if not self.useDefaultKeyPair: From 76749450a5fe0a87ee9e24b2ac1ca1fc9e4518bf Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 30 Aug 2017 15:21:34 +0000 Subject: [PATCH 018/131] Improve tool script that exercises job manager and the code beneath. 
--- tools/ec2Read.py | 64 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 85a28ba0..a18c5d12 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -1,11 +1,63 @@ -import os, sys +import os, sys, time, re sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from vmms.ec2SSH import Ec2SSH +from preallocator import Preallocator +from tangoObjects import TangoQueue +from tangoObjects import TangoMachine +from tango import TangoServer +from config import Config -# test vmms.ec2SSH's image extraction code +# test vmms.ec2SSH's image extraction code, etc # also serve as a template of accessing the ec2SSH vmms -vmms = Ec2SSH() -for key in vmms.img2ami: - image = vmms.img2ami[key] - print image["Name"], image["ImageId"], key +def destroyInstances(): + vms = ec2.getVMs() + for vm in vms: + if re.match("%s-" % Config.PREFIX, vm.name): + print "destroy", vm.name + ec2.destroyVM(vm) + +def listInstances(): + vms = ec2.getVMs() + print "aws instances" + for vm in vms: + print "vm", vm.name + print "list instances", len(vms) + for key in server.preallocator.machines.keys(): + pool = server.preallocator.getPool(key) + print "pool", key, pool["total"], pool["free"] + +def createInstances(num): + for imageName in pools: + (poolName, ext) = os.path.splitext(imageName) + print "creating", num, "for pool", poolName + vm = TangoMachine(vmms="ec2SSH", image=imageName) + server.preallocVM(vm, num) + +def destroyRedisPools(): + for key in server.preallocator.machines.keys(): + print "clean up pool", key + server.preallocator.machines.set(key, [[], TangoQueue(key)]) + server.preallocator.machines.get(key)[1].make_empty() + +def allocateVMs(): + freeList = [] + for key in server.preallocator.machines.keys(): + server.preallocator.allocVM(key) + total = server.preallocator.getPool(key)["total"] + free = server.preallocator.getPool(key)["free"] + print "after 
allocation", key, total, free + +server = TangoServer() +ec2 = server.preallocator.vmms["ec2SSH"] +pools = ec2.img2ami + +listInstances() +destroyInstances() +destroyRedisPools() +createInstances(2) +allocateVMs() +server.resetTango(server.preallocator.vmms) +listInstances() + +exit() From cbe01c26aa5ca90aaa2d1ba06fdf4b01b6530fbf Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 30 Aug 2017 15:26:27 +0000 Subject: [PATCH 019/131] Add ability to submit jobs for a given list of students. --- tools/run_jobs.py | 44 +++++++++++++++++++++++++++++++++++--------- tools/util.py | 22 +++++++++++++++++++--- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index 16edda96..7334787f 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -18,8 +18,8 @@ outputFiles = [] # if either is None, then all student works are submitted. -firstStudentNum = 0 -totalStudents = 1 +firstStudentNum = None +totalStudents = 6 for labIndex in cmdLine.args.indecies: if labIndex >= len(cfg.labs): @@ -28,8 +28,6 @@ for labIndex in cmdLine.args.indecies: lab = Lab(cfg, labIndex) - cmd.info() - cmd.open(lab) students = [] student2fileFullPath = {} @@ -61,13 +59,41 @@ i += 1 exit() - # submit all student works or a given range - if firstStudentNum is None or totalStudents is None: - firstStudentNum = 0 - totalStudents = len(students) + # submit all student works or a given range, or given student list + studentIndexList = [] + studentsToRun = [] + if cmdLine.args.students: + for studentToRun in cmdLine.args.students: + studentIndex = None + nMatches = 0 + index = 0 + for student in students: + if student.startswith(studentToRun): + studentIndex = index + nMatches += 1 + index += 1 + if nMatches != 1: + print "ERROR: no match or multiple matchs found for", studentToRun + exit() + studentIndexList.append(studentIndex) + studentsToRun.append(studentToRun) + + else: + if firstStudentNum is None or totalStudents is None: + firstStudentNum 
= 0 + totalStudents = len(students) + studentIndexList = list(index for index in range (firstStudentNum, firstStudentNum + totalStudents)) + print ("# Found %d students for lab %s" % (len(students), lab.name)) - print ("# Students index range %d..%d" % (firstStudentNum, totalStudents)) + if studentsToRun: + print ("# Students submissions %s %s" % studentsToRun) + else: + print ("# Students index starts at %d and total %d" % (firstStudentNum, totalStudents)) + exit() + + cmd.info() + cmd.open(lab) # load lab files cmd.upload(lab, lab.makefile) diff --git a/tools/util.py b/tools/util.py index 525a1b43..ab5dfa73 100644 --- a/tools/util.py +++ b/tools/util.py @@ -1,20 +1,30 @@ import subprocess, os, argparse class Config: - tangoDir = "/root/autolab-oneclick/server/Tango" + # tangoDir = "/root/autolab-oneclick/server/Tango" + tangoDir = "/mnt/charlene/Tango" cliCmd = "python " + tangoDir + "/clients/tango-cli.py" - tangoHostPort = "host-port 8600" + tangoHostPort = "host-port 8660" tangoIP = "" # output dir used by Tango for submissions - tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" + # tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" + tangoFileRoot = "/mnt/charlene/tango_courselabs" # course definition and handin files location course = "czang-exp" courseRoot = "/n/scratch/czang/f16/" labs = [ + # same test with different images, to test multiple pool (per image) handling {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, + {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "newPool.img"}, + {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "newPool.img"}, {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] + # when "list failures" is requested, the failed tests are listed from the output dir + # for the course/lab, unless the following is true. The the lab's handin from courseRoot + # is used. 
+ examFailuresFromCourseRoot = False + class CommandLine(): def printLabs(self, name=None): print ("available tests:") @@ -31,6 +41,12 @@ def __init__(self, cfg): usage=self.printLabs()) parser.add_argument('indecies', metavar='index', type=int, nargs='+', help="index of a test") + parser.add_argument('-s', '--students', metavar='student', nargs='+', + help="student email") + parser.add_argument('-f', '--failures', action='store_true', + help="list failures") + parser.add_argument('-r', '--re_run', action='store_true', + help="re-run failed jobs") self.args = parser.parse_args() # represent attributes associated to a given lab From 46ceb59147756acffcc1046dcda416595fafb4a5 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 30 Aug 2017 16:23:14 -0400 Subject: [PATCH 020/131] Fix incomplete test script --- tools/run_jobs.py | 9 ++++----- tools/util.py | 9 +++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index 7334787f..52a305ae 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -18,8 +18,8 @@ outputFiles = [] # if either is None, then all student works are submitted. 
-firstStudentNum = None -totalStudents = 6 +firstStudentNum = 5 +totalStudents = 1 for labIndex in cmdLine.args.indecies: if labIndex >= len(cfg.labs): @@ -87,10 +87,9 @@ print ("# Found %d students for lab %s" % (len(students), lab.name)) if studentsToRun: - print ("# Students submissions %s %s" % studentsToRun) + print ("# Students submissions %s" % studentsToRun) else: print ("# Students index starts at %d and total %d" % (firstStudentNum, totalStudents)) - exit() cmd.info() cmd.open(lab) @@ -100,7 +99,7 @@ cmd.upload(lab, lab.autogradeTar) # load and run student submission - for i in range (firstStudentNum, firstStudentNum + totalStudents): + for i in studentIndexList: print ("\n# Submit for %s @ %s" % (students[i], lab.name)) cmd.upload(lab, student2file[students[i]]["full"]) cmd.addJob(lab, student2file[students[i]]) diff --git a/tools/util.py b/tools/util.py index ab5dfa73..1e9cfd90 100644 --- a/tools/util.py +++ b/tools/util.py @@ -2,17 +2,18 @@ class Config: # tangoDir = "/root/autolab-oneclick/server/Tango" - tangoDir = "/mnt/charlene/Tango" + tangoDir = "/nfs/autolab/pdl.cmu.edu/Tango" cliCmd = "python " + tangoDir + "/clients/tango-cli.py" - tangoHostPort = "host-port 8660" + tangoHostPort = "host-port 8600" tangoIP = "" # output dir used by Tango for submissions # tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" - tangoFileRoot = "/mnt/charlene/tango_courselabs" + tangoFileRoot = "/nfs/autolab/pdl.cmu.edu/tango_courselabs" # course definition and handin files location course = "czang-exp" - courseRoot = "/n/scratch/czang/f16/" + # courseRoot = "/n/scratch/czang/f16/" + courseRoot = "/mnt/autolab/" labs = [ # same test with different images, to test multiple pool (per image) handling {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, From 7fef9856a990955065f8e3d0a32077c2d1ad89c5 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 31 Aug 2017 18:01:34 +0000 Subject: [PATCH 021/131] Check if the vm still 
exists before terminating. --- vmms/ec2SSH.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index bf97fb15..b170624c 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -459,6 +459,12 @@ def destroyVM(self, vm): """ destroyVM - Removes a VM from the system """ + # test if the instance still exists + reservations = self.connection.list_all_instances(instance_ids=[vm.ec2_id]) + if not reservatons: + self.log.info("destroyVM: instance non-exist %s %s" % (vm.ec2_id, vm.name)) + return [] + self.log.info("destroyVM: %s %s" % (vm.ec2_id, vm.name)) ret = self.connection.terminate_instances(instance_ids=[vm.ec2_id]) # delete dynamically created key From 0febf75a8fc74271f4fa4990a16c348d90ba0789 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 31 Aug 2017 18:58:06 +0000 Subject: [PATCH 022/131] Check output file for the missing "scores:" line and report them at the end of the run. --- tools/run_jobs.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index 52a305ae..f624b1a7 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -18,7 +18,7 @@ outputFiles = [] # if either is None, then all student works are submitted. 
-firstStudentNum = 5 +firstStudentNum = None totalStudents = 1 for labIndex in cmdLine.args.indecies: @@ -110,6 +110,7 @@ remainingFiles = list(outputFiles) numberRemaining = len(remainingFiles) loopDelay = 5 +badOutputFiles = [] while True: time.sleep(loopDelay) @@ -117,8 +118,13 @@ finishedFiles = [] for file in remainingFiles: if os.path.exists(file) and os.path.getmtime(file) > startTime: - print("Output %s is ready" % file) finishedFiles.append(file) + if "\"scores\":" not in open(file).read(): + badOutputFiles.append(file) + print("BAD output %s" % file) + os.system("tail -5 %s" % file) + else: + print("Output %s is ready" % file) remainingFiles = set(remainingFiles) - set(finishedFiles) nFinished = numberRemaining - len(remainingFiles) @@ -131,3 +137,8 @@ if numberRemaining == 0: print "All output files are counted for :))" break + +if badOutputFiles: + print("Found %d bad output files" % len(badOutputFiles)) + for f in badOutputFiles: + print("bad output: %s" % f) From 5b2bda8f7360043d9a49dcaf2cbb99fba2efe941 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 1 Sep 2017 15:17:15 +0000 Subject: [PATCH 023/131] Fix typos that prevent job manager from starting.
--- vmms/ec2SSH.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index b170624c..4be4df50 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -460,8 +460,8 @@ def destroyVM(self, vm): """ # test if the instance still exists - reservations = self.connection.list_all_instances(instance_ids=[vm.ec2_id]) - if not reservatons: + reservations = self.connection.get_all_instances(instance_ids=[vm.ec2_id]) + if not reservations: self.log.info("destroyVM: instance non-exist %s %s" % (vm.ec2_id, vm.name)) return [] From 4ce9534f5f7a471745fdaed728df73f687db5712 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 5 Sep 2017 17:33:15 +0000 Subject: [PATCH 024/131] Improvements to run_jobs: ability to run failed submissions, to dry run and to list failed submissions. --- tools/run_jobs.py | 49 +++++++++++++++++++-------- tools/util.py | 84 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 100 insertions(+), 33 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index f624b1a7..db74f3a3 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -4,6 +4,7 @@ from util import Cmd from util import CommandLine from util import Lab +import util # drive student submissions to Tango. See ./util.py for preset configuratons. 
# the script finds course and labs at a specified location and submits work @@ -12,7 +13,7 @@ cfg = Config() cmdLine = CommandLine(cfg) -cmd = Cmd(cfg) +cmd = Cmd(cfg, cmdLine) startTime = time.mktime(datetime.datetime.now().timetuple()) outputFiles = [] @@ -26,11 +27,11 @@ print("lab index %d is out of range" % labIndex) exit(-1) +# run list of labs in sequence given on command line for labIndex in cmdLine.args.indecies: - lab = Lab(cfg, labIndex) + lab = Lab(cfg, cmdLine, labIndex) students = [] - student2fileFullPath = {} student2file = {} # get student handin files, the last submission for each student, @@ -52,18 +53,25 @@ student2file[email] = studentFile # print the students and the indices - if False: + if cmdLine.args.list_students: i = 0 for student in students: - print i, student + print i, student, student2file[student] i += 1 exit() - # submit all student works or a given range, or given student list + # submit all student works or a given range, or given student list, + # or all failed students studentIndexList = [] studentsToRun = [] - if cmdLine.args.students: - for studentToRun in cmdLine.args.students: + studentList = cmdLine.args.students + + # look for failures from output or from lab's handin (with "-H" option) + if cmdLine.args.re_run or cmdLine.args.failures: + studentList = util.getRerunList(cfg, lab) + + if studentList: # for -s, -r or -f option + for studentToRun in studentList: studentIndex = None nMatches = 0 index = 0 @@ -84,12 +92,23 @@ totalStudents = len(students) studentIndexList = list(index for index in range (firstStudentNum, firstStudentNum + totalStudents)) + # run students in a given order + studentIndexList.sort() + studentsToRun.sort() - print ("# Found %d students for lab %s" % (len(students), lab.name)) - if studentsToRun: - print ("# Students submissions %s" % studentsToRun) + print ("# Found total %d student submissions for lab %s" % (len(students), lab.name)) + if cmdLine.args.failures: + print ("# %d failed submissions 
from %s" % (len(studentIndexList), lab.outputFileQuery)) + for index in studentIndexList: + print ("%3d: %s" % (index, students[index])) + exit() + + if cmdLine.args.verbose: + print ("# Students submissions: %d" % len(studentIndexList)) + for index in studentIndexList: + print ("%3d: %s" % (index, students[index])) else: - print ("# Students index starts at %d and total %d" % (firstStudentNum, totalStudents)) + print ("# Students to run: %d" % (len(studentIndexList))) cmd.info() cmd.open(lab) @@ -106,6 +125,10 @@ outputFiles.append(lab.outputDir + "/" + student2file[students[i]]["output"]) # end of main loop "cmdLine.args.indecies" +if cmdLine.args.dry_run: + print "\nDry run done" + exit() + print "\nNow waiting for output files..." remainingFiles = list(outputFiles) numberRemaining = len(remainingFiles) @@ -131,7 +154,7 @@ print("%d jobs finished in the last %d seconds" % (nFinished, loopDelay)) print("%d unfinished out of %d" % (len(remainingFiles), len(outputFiles))) now = time.mktime(datetime.datetime.now().timetuple()) - print("%s has passed\n" % (str(datetime.timedelta(seconds = now - startTime)))) + print("elapsed time: %s\n" % (str(datetime.timedelta(seconds = now - startTime)))) numberRemaining = len(remainingFiles) if numberRemaining == 0: diff --git a/tools/util.py b/tools/util.py index 1e9cfd90..d11d2619 100644 --- a/tools/util.py +++ b/tools/util.py @@ -1,30 +1,26 @@ -import subprocess, os, argparse +import subprocess, os, argparse, glob, re class Config: # tangoDir = "/root/autolab-oneclick/server/Tango" - tangoDir = "/nfs/autolab/pdl.cmu.edu/Tango" + tangoDir = "/mnt/charlene/Tango" cliCmd = "python " + tangoDir + "/clients/tango-cli.py" - tangoHostPort = "host-port 8600" + tangoHostPort = "host-port 8660" tangoIP = "" # output dir used by Tango for submissions # tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" - tangoFileRoot = "/nfs/autolab/pdl.cmu.edu/tango_courselabs" + tangoFileRoot = "/mnt/charlene/tango_courselabs" # course 
definition and handin files location course = "czang-exp" # courseRoot = "/n/scratch/czang/f16/" - courseRoot = "/mnt/autolab/" + courseRoot = "/n/scratch/czang/f16/" labs = [ # same test with different images, to test multiple pool (per image) handling {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "newPool.img"}, {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "newPool.img"}, {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] - - # when "list failures" is requested, the failed tests are listed from the output dir - # for the course/lab, unless the following is true. The the lab's handin from courseRoot - # is used. - examFailuresFromCourseRoot = False +# end of class Config class CommandLine(): def printLabs(self, name=None): @@ -43,16 +39,25 @@ def __init__(self, cfg): parser.add_argument('indecies', metavar='index', type=int, nargs='+', help="index of a test") parser.add_argument('-s', '--students', metavar='student', nargs='+', - help="student email") + help="student emails (can be partial)") parser.add_argument('-f', '--failures', action='store_true', - help="list failures") + help="exam failures") parser.add_argument('-r', '--re_run', action='store_true', help="re-run failed jobs") + parser.add_argument('-H', '--handin_records', action='store_true', + help="exam failures or re-run jobs from handin records") + parser.add_argument('-l', '--list_students', action='store_true', + help="list student submissions") + parser.add_argument('-d', '--dry_run', action='store_true', + help="dry_run") + parser.add_argument('-v', '--verbose', action='store_true', + help="more info") self.args = parser.parse_args() +# end of class CmdLine # represent attributes associated to a given lab class Lab: - def __init__(self, cfg, labIndex): + def __init__(self, cfg, cmdLine, labIndex): self.cfg = cfg self.name = cfg.labs[labIndex]["name"] 
self.handinSuffix = cfg.labs[labIndex]["handinSuffix"] @@ -65,14 +70,19 @@ def __init__(self, cfg, labIndex): "handin", "*" + self.handinSuffix]) self.outputDir = None - if cfg.tangoFileRoot: - self.outputDir = "/".join([cfg.tangoFileRoot, - "test-" + self.courseLab, - "output"]) + self.outputDir = "/".join([cfg.tangoFileRoot, + "test-" + self.courseLab, + "output"]) + self.outputFileQuery = self.outputDir + "/*" + self.name + ".txt" + if cmdLine.args.handin_records: + self.outputFileQuery = self.courseLabDir + "/handin/*" + self.name + "_autograde.txt" + print "EXAM FAILURES from", self.outputFileQuery +# end of class Lab class Cmd: - def __init__(self, cfg): + def __init__(self, cfg, cmdLine): self.cfg = cfg + self.cmdLine = cmdLine outBytes = subprocess.check_output(["ps", "-auxw"]) for line in outBytes.decode("utf-8").split("\n"): if cfg.tangoHostPort in line: @@ -91,8 +101,11 @@ def __init__(self, cfg): #end of __init__ def run(self, cmd): # an internal util function - print "EXEC tango-cli", cmd - os.system(self.basic + cmd) + if self.cmdLine.args.dry_run: + print "DRY-RUN tango-cli", cmd + else: + print "EXEC tango-cli", cmd + os.system(self.basic + cmd) print "=======================================" def info(self): @@ -118,3 +131,34 @@ def addJob(self, lab, studentFile): def poll(self, lab, studentFile): myCmd = " --poll -l " + lab.courseLab self.run(myCmd + " --outputFile " + studentFile["output"]) +# end of class Cmd + +# =================== stand alone functions ====================== + +# get student handin files or output files, assuming file names start with student email +def getStudent2file(lab, fileQuery): + files = sorted(glob.glob(lab.outputFileQuery)) # files are sorted by student email + students = [] + student2file = {} + student2version = {} + + for f in files: + baseName = f.split("/").pop() + matchObj = re.match(r'(.*)_([0-9]+)_(.*)', baseName, re.M|re.I) + (email, version) = (matchObj.group(1), matchObj.group(2)) + if email not in 
students: + students.append(email) + if email not in student2version or version > student2version[email]: + student2version[email] = version + student2file[email] = f + return (students, student2file) + +def getRerunList(cfg, lab): + (students, student2file) = getStudent2file(lab, lab.outputFileQuery) + + failedStudents = [] + for s in students: + if "\"scores\":" not in open(student2file[s]).read(): + failedStudents.append(s) + + return failedStudents From 02964602a9c56a019ffd02db6c1dbad762d99acd Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 5 Sep 2017 18:05:24 +0000 Subject: [PATCH 025/131] Add a separate config file for run_jobs to separate config settings from logic changes. --- tools/config_for_run_jobs.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tools/config_for_run_jobs.py diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py new file mode 100644 index 00000000..9cbce094 --- /dev/null +++ b/tools/config_for_run_jobs.py @@ -0,0 +1,36 @@ +# This is a config file for run_jobs.py. +# Change the file to fit your environment. +# Please do NOT commit your changes unless +# 1. There is a need for more configuration settings and +# 2. You have made it known to Xiaolin Charlene Zang. + +class Config: + # The settings are listed in the order of most-likly a changed is needed + # to the least-likely. + + # YOUR course name + course = "czang-exp" + + # YOUR root dir for course/lab definitions and handin + courseRoot = "/n/scratch/czang/f16/" + + # YOUR lab definitions. 
The index of the lab is given to run_job.py + labs = [ + {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, + {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "newPool.img"}, + {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] + + # YOUR Tango container's root dir for submissions and output + tangoFileRoot = "/mnt/charlene/tango_courselabs" + + # YOUR Tango repo root (cloned from Autolab github) + tangoDir = "/mnt/charlene/Tango" + + # Sometimes multiple experimental Tango containers are run on one machine. + # They are identified by different ports. + tangoHostPort = "host-port 8600" + + # IP of the tango container is usually computed automatically + tangoIP = "" + +# end of class Config From b1aa4baa3fa4a74ed953e642bbbcd44c872fbd35 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 5 Sep 2017 18:08:57 +0000 Subject: [PATCH 026/131] Separate "class Config" from util.py. --- tools/run_jobs.py | 2 +- tools/util.py | 24 +----------------------- 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index db74f3a3..f790a10c 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -1,6 +1,6 @@ import os, re, glob, datetime, time -from util import Config +from config_for_run_jobs import Config from util import Cmd from util import CommandLine from util import Lab diff --git a/tools/util.py b/tools/util.py index d11d2619..4c393515 100644 --- a/tools/util.py +++ b/tools/util.py @@ -1,27 +1,5 @@ import subprocess, os, argparse, glob, re -class Config: - # tangoDir = "/root/autolab-oneclick/server/Tango" - tangoDir = "/mnt/charlene/Tango" - cliCmd = "python " + tangoDir + "/clients/tango-cli.py" - tangoHostPort = "host-port 8660" - tangoIP = "" - # output dir used by Tango for submissions - # tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" - tangoFileRoot = "/mnt/charlene/tango_courselabs" - - # course definition and handin files 
location - course = "czang-exp" - # courseRoot = "/n/scratch/czang/f16/" - courseRoot = "/n/scratch/czang/f16/" - labs = [ - # same test with different images, to test multiple pool (per image) handling - {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, - {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "newPool.img"}, - {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "newPool.img"}, - {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] -# end of class Config - class CommandLine(): def printLabs(self, name=None): print ("available tests:") @@ -94,7 +72,7 @@ def __init__(self, cfg, cmdLine): print "ERROR: Cannot find tango server IP" exit(-1) - self.basic = cfg.cliCmd + self.basic = "python " + cfg.tangoDir + "/clients/tango-cli.py" self.basic += " -s " + cfg.tangoIP + " -P 8600" + " -k test" print "CMD BASE:", self.basic From 9be308f1728029f3b2d27d321c69b8220643d6c2 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 5 Sep 2017 18:31:54 +0000 Subject: [PATCH 027/131] Move student submission range into config file. --- tools/config_for_run_jobs.py | 6 ++++++ tools/run_jobs.py | 7 +++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index 9cbce094..8188e219 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -20,6 +20,12 @@ class Config: {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "newPool.img"}, {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] + # Range of student submissions to run (sorted by student emails) + # If either is None, all student submissions are run, unless + # -r, -f, or -s is given to run_jobs. 
+ firstStudentNum = None # start from index 3 + totalStudents = 1 # run one student + # YOUR Tango container's root dir for submissions and output tangoFileRoot = "/mnt/charlene/tango_courselabs" diff --git a/tools/run_jobs.py b/tools/run_jobs.py index f790a10c..8017d3c7 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -18,10 +18,6 @@ startTime = time.mktime(datetime.datetime.now().timetuple()) outputFiles = [] -# if either is None, then all student works are submitted. -firstStudentNum = None -totalStudents = 1 - for labIndex in cmdLine.args.indecies: if labIndex >= len(cfg.labs): print("lab index %d is out of range" % labIndex) @@ -66,6 +62,9 @@ studentsToRun = [] studentList = cmdLine.args.students + firstStudentNum = cfg.firstStudentNum + totalStudents = cfg.totalStudents + # look for failures from output or from lab's handin (with "-H" option) if cmdLine.args.re_run or cmdLine.args.failures: studentList = util.getRerunList(cfg, lab) From a9e29838d9fed8dc7fa2ee89ac244ec470d42729 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 5 Sep 2017 18:33:53 +0000 Subject: [PATCH 028/131] Correct a typo. --- tools/config_for_run_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index 8188e219..3448c154 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -23,7 +23,7 @@ class Config: # Range of student submissions to run (sorted by student emails) # If either is None, all student submissions are run, unless # -r, -f, or -s is given to run_jobs. 
- firstStudentNum = None # start from index 3 + firstStudentNum = 3 # start from index 3 (set to None for all students) totalStudents = 1 # run one student # YOUR Tango container's root dir for submissions and output From e0b5253e5aa804c5d065d81a74c006e80eef263b Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 6 Sep 2017 04:54:42 +0000 Subject: [PATCH 029/131] Better check for output files with missing scores. --- tools/config_for_run_jobs.py | 10 +++++----- tools/run_jobs.py | 30 ++++++++++++++++++++++-------- tools/util.py | 2 +- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index 3448c154..b6e1a98a 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -9,9 +9,9 @@ class Config: # to the least-likely. # YOUR course name - course = "czang-exp" + course = "your-name-experiment" - # YOUR root dir for course/lab definitions and handin + # YOUR root dir for course/lab definitions and handin (student submissions) courseRoot = "/n/scratch/czang/f16/" # YOUR lab definitions. The index of the lab is given to run_job.py @@ -27,10 +27,10 @@ class Config: totalStudents = 1 # run one student # YOUR Tango container's root dir for submissions and output - tangoFileRoot = "/mnt/charlene/tango_courselabs" + tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" - # YOUR Tango repo root (cloned from Autolab github) - tangoDir = "/mnt/charlene/Tango" + # YOUR Tango repo root (cloned from xyzisinus' Autolab github) + tangoDir = "/h/myname/Tango" # Sometimes multiple experimental Tango containers are run on one machine. # They are identified by different ports. 
diff --git a/tools/run_jobs.py b/tools/run_jobs.py index 8017d3c7..d3f68965 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -33,7 +33,7 @@ # get student handin files, the last submission for each student, # and make a map from email to useful attrbutes - for file in sorted(glob.glob(lab.handinFilesQuery)): + for file in sorted(glob.glob(lab.handinFileQuery)): baseName = file.split("/").pop() matchObj = re.match(r'(.*)_[0-9]+_(.*)', baseName, re.M|re.I) email = matchObj.group(1) @@ -51,6 +51,8 @@ # print the students and the indices if cmdLine.args.list_students: i = 0 + print ("# %d student handin for lab %s from %s" % + (len(student2file), lab.name, lab.handinFileQuery)) for student in students: print i, student, student2file[student] i += 1 @@ -69,7 +71,7 @@ if cmdLine.args.re_run or cmdLine.args.failures: studentList = util.getRerunList(cfg, lab) - if studentList: # for -s, -r or -f option + if studentList or cmdLine.args.re_run or cmdLine.args.failures: for studentToRun in studentList: studentIndex = None nMatches = 0 @@ -97,10 +99,11 @@ print ("# Found total %d student submissions for lab %s" % (len(students), lab.name)) if cmdLine.args.failures: - print ("# %d failed submissions from %s" % (len(studentIndexList), lab.outputFileQuery)) + print ("# %d failed submissions for lab %s from %s" % + (len(studentIndexList), lab.name, lab.outputFileQuery)) for index in studentIndexList: print ("%3d: %s" % (index, students[index])) - exit() + continue # move onto next lab if cmdLine.args.verbose: print ("# Students submissions: %d" % len(studentIndexList)) @@ -109,6 +112,10 @@ else: print ("# Students to run: %d" % (len(studentIndexList))) + if len(studentIndexList) == 0: + print ("# No student submissions for lab %s" % lab.name) + continue # move onto next lab + cmd.info() cmd.open(lab) @@ -118,7 +125,7 @@ # load and run student submission for i in studentIndexList: - print ("\n# Submit for %s @ %s" % (students[i], lab.name)) + print ("\n# Submit %s for lab 
%s" % (students[i], lab.name)) cmd.upload(lab, student2file[students[i]]["full"]) cmd.addJob(lab, student2file[students[i]]) outputFiles.append(lab.outputDir + "/" + student2file[students[i]]["output"]) @@ -128,13 +135,13 @@ print "\nDry run done" exit() -print "\nNow waiting for output files..." +print("\nNow waiting for %d output files..." % len(outputFiles)) remainingFiles = list(outputFiles) numberRemaining = len(remainingFiles) loopDelay = 5 badOutputFiles = [] -while True: +while True and len(outputFiles) > 0: time.sleep(loopDelay) finishedFiles = [] @@ -161,6 +168,13 @@ break if badOutputFiles: - print("Found %d bad output files" % len(badOutputFiles)) + # not all bad files are really bad because the file copying may not + # be done when the error is reported, particularly if the file is long + realBadFiles = [] for f in badOutputFiles: + if "\"scores\":" not in open(f).read(): + realBadFiles.append(f) + + print("Found %d bad output files" % len(realBadFiles)) + for f in realBadFiles: print("bad output: %s" % f) diff --git a/tools/util.py b/tools/util.py index 4c393515..2a20ee47 100644 --- a/tools/util.py +++ b/tools/util.py @@ -44,7 +44,7 @@ def __init__(self, cfg, cmdLine, labIndex): self.courseLabDir = cfg.courseRoot + "/" + self.name self.makefile = self.courseLabDir + "/" + "autograde-Makefile" self.autogradeTar = self.courseLabDir + "/" + "autograde.tar" - self.handinFilesQuery = "/".join([self.courseLabDir, + self.handinFileQuery = "/".join([self.courseLabDir, "handin", "*" + self.handinSuffix]) self.outputDir = None From 79b7eadc6616940eeeb68ed846c225e80ad03f01 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 6 Sep 2017 14:23:55 +0000 Subject: [PATCH 030/131] remove trailing spaces. 
--- tools/config_for_run_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index b6e1a98a..d281d7e9 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -28,7 +28,7 @@ class Config: # YOUR Tango container's root dir for submissions and output tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" - + # YOUR Tango repo root (cloned from xyzisinus' Autolab github) tangoDir = "/h/myname/Tango" From 2176dfe64237c75531e083914f874f4f62fed7e8 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 7 Sep 2017 16:38:36 +0000 Subject: [PATCH 031/131] Move logging init in TangoServer to the beginning to capture all logs. Add aws auto scaling group name in config. --- config.template.py | 1 + tango.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/config.template.py b/config.template.py index 17bbf6fd..c8680639 100644 --- a/config.template.py +++ b/config.template.py @@ -153,6 +153,7 @@ class Config: EC2_REGION = '' EC2_USER_NAME = '' + EC2_AUTO_SCALING_GROUP_NAME = None # or the name of the auto scaling group DEFAULT_INST_TYPE = '' DEFAULT_SECURITY_GROUP = '' SECURITY_KEY_PATH = '' diff --git a/tango.py b/tango.py index 02c06869..fa1e6fca 100755 --- a/tango.py +++ b/tango.py @@ -52,6 +52,13 @@ class TangoServer: def __init__(self): self.daemon = True + + # init logging early, or some logging will be lost + logging.basicConfig( + filename=Config.LOGFILE, + format="%(levelname)s|%(asctime)s|%(name)s|%(message)s", + level=Config.LOGLEVEL, + ) vmms = None if Config.VMMS_NAME == "tashiSSH": @@ -75,11 +82,6 @@ def __init__(self): # be initiated separately JobManager(self.jobQueue).start() - logging.basicConfig( - filename=Config.LOGFILE, - format="%(levelname)s|%(asctime)s|%(name)s|%(message)s", - level=Config.LOGLEVEL, - ) self.start_time = time.time() self.log = logging.getLogger("TangoServer") self.log.info("Starting 
Tango server") From a5bc482adde9dbafe58a9ed8e191a43cc19f3358 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 14 Sep 2017 19:49:27 +0000 Subject: [PATCH 032/131] 1. Add ability to shrink preallocated pool. 2. preliminary code to attach instance to asw auto scaling group. 3. Error handling related to initializeVM(). 4. Consistency assurance in removeVM(). --- preallocator.py | 58 ++++++++++++++++++++++++++--- vmms/ec2SSH.py | 99 +++++++++++++++++++++++++++++++++++-------------- worker.py | 1 + 3 files changed, 124 insertions(+), 34 deletions(-) diff --git a/preallocator.py b/preallocator.py index f4421a7b..5c5814ed 100644 --- a/preallocator.py +++ b/preallocator.py @@ -42,11 +42,28 @@ def freePoolSize(self, vmName): else: return 0 + def decrementPoolSize(self, vm): + """ + Called by worker to shrink the pool, after returning a vm to free pool + """ + + if not (Config.POOL_SIZE_LOW_WATER_MARK and vm.name in self.machines.keys()): + return + + delta = self.freePoolSize(vm.name) - Config.POOL_SIZE_LOW_WATER_MARK + if delta > 0: + self.log.info("decrementPoolSize: remove %d vms from pool %s" % (delta, vm.name)) + for i in range(delta): + threading.Thread(target=self.__destroy(vm)).start() + def incrementPoolSize(self, vm, delta): """ Called by jobQueue to create the pool and allcoate given number of vms """ + if not delta: # POOL_ALLOC_INCREMENT may not be defined in Config + delta = 1 + self.lock.acquire() if vm.name not in self.machines.keys(): self.machines.set(vm.name, [[], TangoQueue(vm.name)]) @@ -142,16 +159,42 @@ def addVM(self, vm): self.machines.set(vm.name, machine) self.lock.release() + # Note: This function is called from removeVM() to handle the case when a vm + # is in free pool. In theory this should never happen but we want to ensure + # that. To solve the problem cleanly, preallocator should provide ONE primitive + # to add/remove a vm from both total and free pools, instead of two disjoint ones. 
+ def removeFromFreePool(self, vm): + dieVM = None + self.lock.acquire() + size = self.machines.get(vm.name)[1].qsize() + self.log.info("removeFromFreePool: %s in pool %s" % (vm.id, vm.name)) + for i in range(size): # go through free pool + vm = self.machines.get(vm.name)[1].get_nowait() + # put it back into free pool, if not our vm + if vm.id != id: + self.machines.get(vm.name)[1].put(vm) + else: + self.log.info("removeFromFreePool: found %s in pool %s" % (vm.id, vm.name)) + # don't put this particular vm back to free pool, that is removal + self.lock.release() + def removeVM(self, vm): """ removeVM - remove a particular VM instance from the pool """ self.lock.acquire() machine = self.machines.get(vm.name) + if vm.id not in machine[0]: + self.log.error("removeVM: %s NOT found in pool" % (vm.id, vm.name)) + self.lock.release() + return + self.log.info("removeVM: %s" % vm.id) machine[0].remove(vm.id) self.machines.set(vm.name, machine) self.lock.release() + self.removeFromFreePool(vm) # also remove from free pool, just in case + def _getNextID(self): """ _getNextID - returns next ID to be used for a preallocated VM. 
Preallocated VM's have 4-digit ID numbers between 1000 @@ -181,7 +224,10 @@ def __create(self, vm, cnt): newVM = copy.deepcopy(vm) newVM.id = self._getNextID() self.log.debug("__create|calling initializeVM") - vmms.initializeVM(newVM) + ret = vmms.initializeVM(newVM) + if not ret: # ret is None when fails + self.log.debug("__create|failed initializeVM") + continue self.log.debug("__create|done with initializeVM") time.sleep(Config.CREATEVM_SECS) @@ -204,7 +250,7 @@ def __destroy(self, vm): self.lock.release() if dieVM: - self.log.info("__destroy: %s" % vm.id) + self.log.info("__destroy: %s" % dieVM.id) self.removeVM(dieVM) vmms = self.vmms[vm.vmms] vmms.safeDestroyVM(dieVM) @@ -219,7 +265,10 @@ def createVM(self, vm): newVM.id = self._getNextID() self.log.info("createVM|calling initializeVM") - vmms.initializeVM(newVM) + ret = vmms.initializeVM(newVM) + if not ret: + self.log.debug("createVM|failed initializeVM") + return self.log.info("createVM|done with initializeVM %s" % newVM.id) self.addVM(newVM) @@ -240,9 +289,6 @@ def destroyVM(self, vmName, id): self.lock.acquire() size = self.machines.get(vmName)[1].qsize() self.log.info("destroyVM: free:total pool %d:%d" % (size, len(self.machines.get(vmName)[0]))) - # xxxXXX??? the following code is questionable: It requires that - # all vms are free. Otherwise it doesn't do anything. Is this desired - # behavior? 
if (size == len(self.machines.get(vmName)[0])): for i in range(size): vm = self.machines.get(vmName)[1].get_nowait() diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 4be4df50..1e6373b6 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -99,6 +99,11 @@ def __init__(self, accessKeyId=None, accessKey=None): instance - Instance object that stores information about the VM created """ + + self.log = logging.getLogger("Ec2SSH-" + str(os.getpid())) + + self.log.info("init Ec2SSH") + self.ssh_flags = Ec2SSH._SSH_FLAGS if accessKeyId: self.connection = ec2.connect_to_region(config.Config.EC2_REGION, @@ -107,7 +112,6 @@ def __init__(self, accessKeyId=None, accessKey=None): else: self.connection = ec2.connect_to_region(config.Config.EC2_REGION) self.useDefaultKeyPair = True - self.log = logging.getLogger("Ec2SSH-" + str(os.getpid())) # Use boto3 to read images. Find the "Name" tag and use it as key to # build a map from "Name tag" to boto3's image structure. @@ -141,6 +145,23 @@ def __init__(self, accessKeyId=None, accessKey=None): if (len(ignoredAmis) > 0): self.log.info("Ignored amis %s due to lack of proper name tag" % str(ignoredAmis)) + # preliminary code for auto scaling group (configured by EC2_AUTO_SCALING_GROUP_NAME) + # Here we get the pointer to the group, if any. + # When an instance is created, it's attached to the group. + # When an instance is terminated, it's detached. 
+ self.asg = None + self.auto_scaling_group = None + if config.Config.EC2_AUTO_SCALING_GROUP_NAME: + self.asg = boto3.client("autoscaling", config.Config.EC2_REGION) + groups = self.asg.describe_auto_scaling_groups(AutoScalingGroupNames=[config.Config.EC2_AUTO_SCALING_GROUP_NAME]) + if len(groups['AutoScalingGroups']) == 1: + self.auto_scaling_group = groups['AutoScalingGroups'][0] + self.log.info("Use aws auto scaling group %s" % config.Config.EC2_AUTO_SCALING_GROUP_NAME) + + instances = self.asg.describe_auto_scaling_instances()['AutoScalingInstances'] + else: + self.log.info("Cannot find auto scaling group %s" % config.Config.EC2_AUTO_SCALING_GROUP_NAME) + def instanceName(self, id, name): """ instanceName - Constructs a VM instance name. Always use this function when you need a VM instance name. Never generate @@ -232,15 +253,16 @@ def getInstanceByReservationId(self, reservationId): # VMMS API functions # def initializeVM(self, vm): - """ initializeVM - Tell EC2 to create a new VM instance. + """ initializeVM - Tell EC2 to create a new VM instance. return None on failure Returns a boto.ec2.instance.Instance object. 
""" # Create the instance and obtain the reservation + newInstance = None try: instanceName = self.instanceName(vm.id, vm.name) ec2instance = self.tangoMachineToEC2Instance(vm) - self.log.info("initiliazeVM: %s %s" % (instanceName, str(ec2instance))) + self.log.info("initializeVM: %s %s" % (instanceName, str(ec2instance))) # ensure that security group exists self.createSecurityGroup() if self.useDefaultKeyPair: @@ -250,44 +272,46 @@ def initializeVM(self, vm): self.key_pair_name = self.keyPairName(vm.id, vm.name) self.createKeyPair() - reservation = self.connection.run_instances( ec2instance['ami'], key_name=self.key_pair_name, security_groups=[ config.Config.DEFAULT_SECURITY_GROUP], instance_type=ec2instance['instance_type']) - newInstance = self.getInstanceByReservationId(reservation.id) if newInstance: # Assign name to EC2 instance self.connection.create_tags([newInstance.id], {"Name": instanceName}) self.log.info("new instance created %s" % newInstance) else: - self.log.info("failed to find new instance for %s" % instanceName) - # Todo: should throw exception, etc. But without full understanding - # of the overall code structure, don't do anything for now. XXXxxx??? - return vm + raise ValueError("cannot find new instance for %s" % instanceName) # Wait for instance to reach 'running' state - state = -1 start_time = time.time() - while state is not config.Config.INSTANCE_RUNNING: - newInstance = self.getInstanceByReservationId(reservation.id) - if not newInstance: # XXXxxx??? again, need error handling - self.log.info("failed to obtain status for %s" % instanceName) - return vm - - state = newInstance.state_code - self.log.debug( - "VM %s: Waiting to reach 'running' state. 
Current state: %s (%d)" % - (instanceName, newInstance.state, state)) - time.sleep(config.Config.TIMER_POLL_INTERVAL) + while True: elapsed_secs = time.time() - start_time - if (elapsed_secs > config.Config.INITIALIZEVM_TIMEOUT): - self.log.debug( - "VM %s: Did not reach 'running' state before timeout period of %d" % - (instanceName, config.Config.TIMER_POLL_INTERVAL)) + + newInstance = self.getInstanceByReservationId(reservation.id) + if not newInstance: + raise ValueError("cannot obtain aws instance for %s" % instanceName) + + if newInstance.state == "pending": + if elapsed_secs > config.Config.INITIALIZEVM_TIMEOUT: + raise ValueError("VM %s: timeout (%d seconds) before reaching 'running' state" % + (instanceName, config.Config.TIMER_POLL_INTERVAL)) + + self.log.debug("VM %s: Waiting to reach 'running' from 'pending'" % instanceName) + time.sleep(config.Config.TIMER_POLL_INTERVAL) + continue + + if newInstance.state == "running": + self.log.debug("VM %s: has reached 'running' state in %d seconds" % + (instanceName, elapsed_secs)) + break + + raise ValueError("VM %s: quit waiting when seeing state '%s' after %d seconds" % + (instanceName, newInstance.state, elapsed_secs)) + # end of while loop self.log.info( "VM %s | State %s | Reservation %s | Public DNS Name %s | Public IP Address %s" % @@ -297,6 +321,11 @@ def initializeVM(self, vm): newInstance.public_dns_name, newInstance.ip_address)) + if self.auto_scaling_group: + self.asg.attach_instances(InstanceIds=[newInstance.id], + AutoScalingGroupName=config.Config.EC2_AUTO_SCALING_GROUP_NAME) + self.log.info("attach new instance %s to auto scaling group" % newInstance.id) + # Save domain and id ssigned by EC2 in vm object vm.domain_name = newInstance.ip_address vm.ec2_id = newInstance.id @@ -305,7 +334,8 @@ def initializeVM(self, vm): except Exception as e: self.log.debug("initializeVM Failed: %s" % e) - + if newInstance: + self.connection.terminate_instances(instance_ids=[newInstance.id]) return None def 
waitVM(self, vm, max_secs): @@ -462,14 +492,27 @@ def destroyVM(self, vm): # test if the instance still exists reservations = self.connection.get_all_instances(instance_ids=[vm.ec2_id]) if not reservations: - self.log.info("destroyVM: instance non-exist %s %s" % (vm.ec2_id, vm.name)) - return [] + self.log.info("destroyVM: instance non-exist %s %s" % (vm.ec2_id, vm.name)) + return [] self.log.info("destroyVM: %s %s" % (vm.ec2_id, vm.name)) + ret = self.connection.terminate_instances(instance_ids=[vm.ec2_id]) # delete dynamically created key if not self.useDefaultKeyPair: self.deleteKeyPair() + + if self.auto_scaling_group: + response = self.asg.describe_auto_scaling_instances(InstanceIds=[vm.ec2_id], + MaxRecords=1) + if len(response['AutoScalingInstances']) == 1: + self.asg.detach_instances(InstanceIds=[vm.ec2_id], + AutoScalingGroupName=config.Config.EC2_AUTO_SCALING_GROUP_NAME, + ShouldDecrementDesiredCapacity=True) + self.log.info("detach instance %s %s from auto scaling group" % (vm.ec2_id, vm.name)) + else: + self.log.info("instance %s %s not in auto scaling group" % (vm.ec2_id, vm.name)) + return ret def safeDestroyVM(self, vm): diff --git a/worker.py b/worker.py index 3c9af433..12ec9784 100644 --- a/worker.py +++ b/worker.py @@ -52,6 +52,7 @@ def detachVM(self, return_vm=False, replace_vm=False): self.vmms.safeDestroyVM(self.job.vm) elif return_vm: self.preallocator.freeVM(self.job.vm) + self.preallocator.decrementPoolSize(self.job.vm) # may reduce size of pool else: self.vmms.safeDestroyVM(self.job.vm) if replace_vm: From b3a076ff0852a855690ceb9cf38bbaac265f5eb1 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 14 Sep 2017 19:55:59 +0000 Subject: [PATCH 033/131] Add ability to test decrementing pool size --- tools/ec2Read.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index a18c5d12..37fe4219 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -23,6 +23,7 @@ def 
listInstances(): for vm in vms: print "vm", vm.name print "list instances", len(vms) + print "pools", ec2.img2ami.keys() for key in server.preallocator.machines.keys(): pool = server.preallocator.getPool(key) print "pool", key, pool["total"], pool["free"] @@ -34,6 +35,14 @@ def createInstances(num): vm = TangoMachine(vmms="ec2SSH", image=imageName) server.preallocVM(vm, num) +def shrinkPools(): + for imageName in pools: + (poolName, ext) = os.path.splitext(imageName) + vm = TangoMachine(vmms="ec2SSH", image=imageName) + vm.name = poolName + print "shrink pool", vm.name + server.preallocator.decrementPoolSize(vm) + def destroyRedisPools(): for key in server.preallocator.machines.keys(): print "clean up pool", key @@ -53,11 +62,15 @@ def allocateVMs(): pools = ec2.img2ami listInstances() +exit() destroyInstances() destroyRedisPools() createInstances(2) +shrinkPools() +exit() + allocateVMs() +exit() server.resetTango(server.preallocator.vmms) listInstances() -exit() From d896b360f6c8111a6be81df89bd43917519dd581 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 15 Sep 2017 15:31:45 +0000 Subject: [PATCH 034/131] Add vm pool low water mark. --- config.template.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/config.template.py b/config.template.py index c8680639..5766bb17 100644 --- a/config.template.py +++ b/config.template.py @@ -106,10 +106,13 @@ class Config: CREATEVM_SECS = 1 # Default vm pool size - POOL_SIZE = 2 + POOL_SIZE = 10 + + # vm pool reserve size. If set, free pool size is maintained at the level. 
+ POOL_SIZE_LOW_WATER_MARK = None # optional, can be None # Default increment step when enlarging vm pool - POOL_ALLOC_INCREMENT = 2 + POOL_ALLOC_INCREMENT = 2 # can be None, which is treated as 1 # Optionally log finer-grained timing information LOG_TIMING = False From 780557749cd14c272aad6a7ea4d5e04ff2ac18ed Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 15 Sep 2017 19:36:08 +0000 Subject: [PATCH 035/131] test for existence of the newly config variables for backward compatibility. --- jobQueue.py | 5 ++++- preallocator.py | 6 ++---- vmms/ec2SSH.py | 10 ++++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/jobQueue.py b/jobQueue.py index 9128ae86..5b53b4f5 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -212,7 +212,10 @@ def getNextPendingJobReuse(self, target_id=None): # the limit for pool is not reached yet if self.preallocator.freePoolSize(job.vm.name) == 0 and \ self.preallocator.poolSize(job.vm.name) < Config.POOL_SIZE: - self.preallocator.incrementPoolSize(job.vm, Config.POOL_ALLOC_INCREMENT) + increment = 1 + if hasattr(Config, 'POOL_ALLOC_INCREMENT') and Config.POOL_ALLOC_INCREMENT: + increment = Config.POOL_ALLOC_INCREMENT + self.preallocator.incrementPoolSize(job.vm, increment) # If the job hasn't been assigned to a worker yet, see if there # is a free VM diff --git a/preallocator.py b/preallocator.py index 5c5814ed..33718c07 100644 --- a/preallocator.py +++ b/preallocator.py @@ -47,7 +47,8 @@ def decrementPoolSize(self, vm): Called by worker to shrink the pool, after returning a vm to free pool """ - if not (Config.POOL_SIZE_LOW_WATER_MARK and vm.name in self.machines.keys()): + if not (hasattr(Config, 'POOL_SIZE_LOW_WATER_MARK') and + Config.POOL_SIZE_LOW_WATER_MARK and vm.name in self.machines.keys()): return delta = self.freePoolSize(vm.name) - Config.POOL_SIZE_LOW_WATER_MARK @@ -61,9 +62,6 @@ def incrementPoolSize(self, vm, delta): Called by jobQueue to create the pool and allcoate given number of vms """ - if not 
delta: # POOL_ALLOC_INCREMENT may not be defined in Config - delta = 1 - self.lock.acquire() if vm.name not in self.machines.keys(): self.machines.set(vm.name, [[], TangoQueue(vm.name)]) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 1e6373b6..9981428f 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -151,12 +151,14 @@ def __init__(self, accessKeyId=None, accessKey=None): # When an instance is terminated, it's detached. self.asg = None self.auto_scaling_group = None - if config.Config.EC2_AUTO_SCALING_GROUP_NAME: + self.auto_scaling_group_name = None + if hasattr(config.Config, 'EC2_AUTO_SCALING_GROUP_NAME') and config.Config.EC2_AUTO_SCALING_GROUP_NAME: self.asg = boto3.client("autoscaling", config.Config.EC2_REGION) groups = self.asg.describe_auto_scaling_groups(AutoScalingGroupNames=[config.Config.EC2_AUTO_SCALING_GROUP_NAME]) if len(groups['AutoScalingGroups']) == 1: self.auto_scaling_group = groups['AutoScalingGroups'][0] - self.log.info("Use aws auto scaling group %s" % config.Config.EC2_AUTO_SCALING_GROUP_NAME) + self.auto_scaling_group_name = config.Config.EC2_AUTO_SCALING_GROUP_NAME + self.log.info("Use aws auto scaling group %s" % self.auto_scaling_group_name) instances = self.asg.describe_auto_scaling_instances()['AutoScalingInstances'] else: @@ -323,7 +325,7 @@ def initializeVM(self, vm): if self.auto_scaling_group: self.asg.attach_instances(InstanceIds=[newInstance.id], - AutoScalingGroupName=config.Config.EC2_AUTO_SCALING_GROUP_NAME) + AutoScalingGroupName=self.auto_scaling_group_name) self.log.info("attach new instance %s to auto scaling group" % newInstance.id) # Save domain and id ssigned by EC2 in vm object @@ -507,7 +509,7 @@ def destroyVM(self, vm): MaxRecords=1) if len(response['AutoScalingInstances']) == 1: self.asg.detach_instances(InstanceIds=[vm.ec2_id], - AutoScalingGroupName=config.Config.EC2_AUTO_SCALING_GROUP_NAME, + AutoScalingGroupName=self.auto_scaling_group_name, ShouldDecrementDesiredCapacity=True) self.log.info("detach 
instance %s %s from auto scaling group" % (vm.ec2_id, vm.name)) else: From aaa07ebc45a0b72cd73540ad1026155e0c952b64 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 11 Oct 2017 17:58:48 +0000 Subject: [PATCH 036/131] Fix a condition that checks low_water_mark config variable. --- preallocator.py | 2 +- restful-tango/tangoREST.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/preallocator.py b/preallocator.py index 33718c07..67be4e24 100644 --- a/preallocator.py +++ b/preallocator.py @@ -48,7 +48,7 @@ def decrementPoolSize(self, vm): """ if not (hasattr(Config, 'POOL_SIZE_LOW_WATER_MARK') and - Config.POOL_SIZE_LOW_WATER_MARK and vm.name in self.machines.keys()): + Config.POOL_SIZE_LOW_WATER_MARK >= 0 and vm.name in self.machines.keys()): return delta = self.freePoolSize(vm.name) - Config.POOL_SIZE_LOW_WATER_MARK diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index c61dff1c..602ee885 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -369,11 +369,11 @@ def jobs(self, key, deadJobs): if (int(deadJobs) == 0): jobs = self.tango.getJobs(0) self.log.debug( - "Retrieved live jobs (deadJobs = %s)" % deadJobs) + "Retrieved %d live jobs (deadJobs = %s)" % (len(jobs), deadJobs)) elif (int(deadJobs) == 1): jobs = self.tango.getJobs(-1) self.log.debug( - "Retrieved dead jobs (deadJobs = %s)" % deadJobs) + "Retrieved %d dead jobs (deadJobs = %s)" % (len(jobs), deadJobs)) result['jobs'] = list() for job in jobs: result['jobs'].append(self.convertTangoJobObj(job)) From 965b2815bb62fd4be521274828adb12e2e39babd Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 11 Oct 2017 18:46:49 +0000 Subject: [PATCH 037/131] A useful suggestive setting for pool low water mark. 
--- config.template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.template.py b/config.template.py index 5766bb17..b508031e 100644 --- a/config.template.py +++ b/config.template.py @@ -109,7 +109,7 @@ class Config: POOL_SIZE = 10 # vm pool reserve size. If set, free pool size is maintained at the level. - POOL_SIZE_LOW_WATER_MARK = None # optional, can be None + POOL_SIZE_LOW_WATER_MARK = 5 # optional, can be None # Default increment step when enlarging vm pool POOL_ALLOC_INCREMENT = 2 # can be None, which is treated as 1 From 7432c4b206ab1bf1731021ac84448f9b3506a5f7 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 12 Oct 2017 17:50:08 +0000 Subject: [PATCH 038/131] Add -j flag to list all jobs. Examine output file in next iteration of the loop to make sure that the file is fully copied. --- tools/run_jobs.py | 29 +++++++++++++++++++++---------- tools/util.py | 22 ++++++++++++++++++++-- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index d3f68965..f4426a36 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -15,6 +15,11 @@ cmdLine = CommandLine(cfg) cmd = Cmd(cfg, cmdLine) +if cmdLine.args.jobs: + cmd.jobs() + exit() + + startTime = time.mktime(datetime.datetime.now().timetuple()) outputFiles = [] @@ -140,22 +145,26 @@ numberRemaining = len(remainingFiles) loopDelay = 5 badOutputFiles = [] +justFinishedFiles = [] while True and len(outputFiles) > 0: time.sleep(loopDelay) - finishedFiles = [] + # if we check the output file for scores as soon as it shows up, + # the file may not fulled copied. So we check the files found in + # the last round. 
+ for file in justFinishedFiles: + if "\"scores\":" not in open(file).read(): + badOutputFiles.append(file) + print("output missing scores: %s" % file) + else: + print("Output ready: %s" % file) + + justFinishedFiles = [] for file in remainingFiles: if os.path.exists(file) and os.path.getmtime(file) > startTime: - finishedFiles.append(file) - if "\"scores\":" not in open(file).read(): - badOutputFiles.append(file) - print("BAD output %s" % file) - os.system("tail -5 %s" % file) - else: - print("Output %s is ready" % file) - - remainingFiles = set(remainingFiles) - set(finishedFiles) + justFinishedFiles.append(file) + remainingFiles = set(remainingFiles) - set(justFinishedFiles) nFinished = numberRemaining - len(remainingFiles) print("%d jobs finished in the last %d seconds" % (nFinished, loopDelay)) print("%d unfinished out of %d" % (len(remainingFiles), len(outputFiles))) diff --git a/tools/util.py b/tools/util.py index 2a20ee47..cf67c634 100644 --- a/tools/util.py +++ b/tools/util.py @@ -1,4 +1,4 @@ -import subprocess, os, argparse, glob, re +import subprocess, os, argparse, glob, re, json class CommandLine(): def printLabs(self, name=None): @@ -26,6 +26,8 @@ def __init__(self, cfg): help="exam failures or re-run jobs from handin records") parser.add_argument('-l', '--list_students', action='store_true', help="list student submissions") + parser.add_argument('-j', '--jobs', action='store_true', + help="list all jobs (test index ignored)") parser.add_argument('-d', '--dry_run', action='store_true', help="dry_run") parser.add_argument('-v', '--verbose', action='store_true', @@ -84,7 +86,11 @@ def run(self, cmd): # an internal util function else: print "EXEC tango-cli", cmd os.system(self.basic + cmd) - print "=======================================" + print "=======================================" + + def runAndOutput(self, cmd): + print "EXEC-CAPTURE tango-cli", cmd + return os.popen(self.basic + cmd).read() def info(self): self.run(" --info") @@ -109,6 +115,18 
@@ def addJob(self, lab, studentFile): def poll(self, lab, studentFile): myCmd = " --poll -l " + lab.courseLab self.run(myCmd + " --outputFile " + studentFile["output"]) + + def jobs(self): + result = json.loads(self.runAndOutput(" --jobs ").splitlines()[1]) + nJobs = len(result["jobs"]) + print "Waiting/running jobs:", nJobs + print json.dumps(result, indent=2, sort_keys=True) + print "=======================================" + + result = json.loads(self.runAndOutput(" --jobs --deadJobs 1 ").splitlines()[1]) + nJobs = len(result["jobs"]) + print "Completed jobs:", nJobs + print json.dumps(result, indent=2, sort_keys=True) # end of class Cmd # =================== stand alone functions ====================== From d2c86e32a8b039e03027c8d2d89ae11ce4855494 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 12 Oct 2017 17:51:19 +0000 Subject: [PATCH 039/131] When low water mark is zero, the free_destroy vm op works now. --- preallocator.py | 35 ++++++++++++++++++++++++++++------- worker.py | 2 +- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/preallocator.py b/preallocator.py index 67be4e24..242b56ac 100644 --- a/preallocator.py +++ b/preallocator.py @@ -125,26 +125,47 @@ def allocVM(self, vmName): return vm + def addToFreePool(self, vm): + """ addToFreePool - Returns a VM instance to the free list + """ + + self.lock.acquire() + machine = self.machines.get(vm.name) + self.log.info("addToFreePool: add %s to free pool" % vm.id) + machine[1].put(vm) + self.machines.set(vm.name, machine) + self.lock.release() + def freeVM(self, vm): """ freeVM - Returns a VM instance to the free list """ # Sanity check: Return a VM to the free list only if it is # still a member of the pool. 
not_found = False + should_destroy = False self.lock.acquire() if vm and vm.id in self.machines.get(vm.name)[0]: - machine = self.machines.get(vm.name) - self.log.info("freeVM: return %s to free pool" % vm.id) - machine[1].put(vm) - self.machines.set(vm.name, machine) + if (hasattr(Config, 'POOL_SIZE_LOW_WATER_MARK') and + Config.POOL_SIZE_LOW_WATER_MARK >= 0 and + vm.name in self.machines.keys() and + self.freePoolSize(vm.name) >= Config.POOL_SIZE_LOW_WATER_MARK): + self.log.info("freeVM: over low water mark. will destroy %s" % vm.id) + should_destroy = True + else: + machine = self.machines.get(vm.name) + self.log.info("freeVM: return %s to free pool" % vm.id) + machine[1].put(vm) + self.machines.set(vm.name, machine) else: + self.log.info("freeVM: not found in pool %s. will destroy %s" % (vm.name, vm.id)) not_found = True self.lock.release() # The VM is no longer in the pool. - if not_found: + if not_found or should_destroy: self.log.info("freeVM: will destroy %s" % vm.id) vmms = self.vmms[vm.vmms] + self.removeVM(vm) vmms.safeDestroyVM(vm) def addVM(self, vm): @@ -230,7 +251,7 @@ def __create(self, vm, cnt): time.sleep(Config.CREATEVM_SECS) self.addVM(newVM) - self.freeVM(newVM) + self.addToFreePool(newVM) self.log.debug("__create: Added vm %s to pool %s " % (newVM.id, newVM.name)) @@ -270,7 +291,7 @@ def createVM(self, vm): self.log.info("createVM|done with initializeVM %s" % newVM.id) self.addVM(newVM) - self.freeVM(newVM) + self.addToFreePool(newVM) self.log.debug("createVM: Added vm %s to pool %s" % (newVM.id, newVM.name)) diff --git a/worker.py b/worker.py index 12ec9784..f58e378c 100644 --- a/worker.py +++ b/worker.py @@ -51,8 +51,8 @@ def detachVM(self, return_vm=False, replace_vm=False): if self.job.accessKeyId: self.vmms.safeDestroyVM(self.job.vm) elif return_vm: + # put vm into free pool. 
may destroy it if free pool is over low water mark self.preallocator.freeVM(self.job.vm) - self.preallocator.decrementPoolSize(self.job.vm) # may reduce size of pool else: self.vmms.safeDestroyVM(self.job.vm) if replace_vm: From 9736236ceb5cdb8d1b400259a8b1bad7ab627baa Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 13 Oct 2017 15:41:25 +0000 Subject: [PATCH 040/131] Add option to direct query to different redis servers with port number. --- tools/config_for_run_jobs.py | 7 ++++++- tools/ec2Read.py | 12 ++++++++++++ tools/redisRead.py | 15 --------------- 3 files changed, 18 insertions(+), 16 deletions(-) delete mode 100644 tools/redisRead.py diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index d281d7e9..2fa03a77 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -38,5 +38,10 @@ class Config: # IP of the tango container is usually computed automatically tangoIP = "" - + + # Redis port. Sometimes we have two redis running, each support a Tango + # In such case a different forwarding port is assigned to it. + # Note: This variable is used by ec2Read.py only. + redisPort = 6379 # standard + # end of class Config diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 37fe4219..44576f72 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -6,6 +6,9 @@ from tangoObjects import TangoMachine from tango import TangoServer from config import Config +import tangoObjects +import config_for_run_jobs +import redis # test vmms.ec2SSH's image extraction code, etc # also serve as a template of accessing the ec2SSH vmms @@ -57,6 +60,15 @@ def allocateVMs(): free = server.preallocator.getPool(key)["free"] print "after allocation", key, total, free + +# When a host has two Tango containers (for experiment), there are two +# redis servers, too. They differ by the forwarding port number, which +# is defined in config_for_run_jobs.py. 
To select the redis server, +# We get the connection here and pass it into tangoObjects +redisConnection = redis.StrictRedis( + host=Config.REDIS_HOSTNAME, port=config_for_run_jobs.Config.redisPort, db=0) +tangoObjects.getRedisConnection(connection=redisConnection) + server = TangoServer() ec2 = server.preallocator.vmms["ec2SSH"] pools = ec2.img2ami diff --git a/tools/redisRead.py b/tools/redisRead.py deleted file mode 100644 index 2a0b37e6..00000000 --- a/tools/redisRead.py +++ /dev/null @@ -1,15 +0,0 @@ -import sys, os -# search parent dirs for importable packages -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from tangoObjects import TangoDictionary - -# list all "machines" pools and get the total and free sets for each pool -# it also serves as a template of extracting contents from redis - -machines = TangoDictionary("machines") -print "pools", machines.keys() - -for poolName in machines.keys(): - print "pool:", poolName - print "total:", machines.get(poolName)[0] - print "free:", machines.get(poolName)[1].qsize(), machines.get(poolName)[1].dump() From 56c71c1fe97e3f336504951d1eb3901dd0394ba9 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 13 Oct 2017 19:04:35 +0000 Subject: [PATCH 041/131] Sort the lists of vms and pool items. 
--- tools/ec2Read.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 44576f72..0d6e56e7 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -22,14 +22,17 @@ def destroyInstances(): def listInstances(): vms = ec2.getVMs() - print "aws instances" - for vm in vms: + print "aws instances", len(vms) + for vm in sorted(vms, key=lambda x: x.name): print "vm", vm.name - print "list instances", len(vms) print "pools", ec2.img2ami.keys() for key in server.preallocator.machines.keys(): pool = server.preallocator.getPool(key) - print "pool", key, pool["total"], pool["free"] + totalPool = pool["total"] + freePool = pool["free"] + totalPool.sort() + freePool.sort() + print "pool", key, "total", len(totalPool), totalPool, freePool def createInstances(num): for imageName in pools: From 484f435a91911e1be878f04c3d2e529cb9f2b1bc Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 13 Oct 2017 19:10:28 +0000 Subject: [PATCH 042/131] Wait a bit after requesting an instance from aws. also allows a redis connection to be passed into tangoObjects, for testing purpose. 
--- tangoObjects.py | 6 +++++- vmms/ec2SSH.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tangoObjects.py b/tangoObjects.py index b82a99d0..108382dd 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -10,9 +10,13 @@ redisConnection = None -def getRedisConnection(): +def getRedisConnection(connection=None): global redisConnection if redisConnection is None: + if connection: + redisConnection = connection + return redisConnection + redisConnection = redis.StrictRedis( host=Config.REDIS_HOSTNAME, port=Config.REDIS_PORT, db=0) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 9981428f..d63f4b00 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -280,6 +280,11 @@ def initializeVM(self, vm): security_groups=[ config.Config.DEFAULT_SECURITY_GROUP], instance_type=ec2instance['instance_type']) + + # Sleep for a while to prevent random transient errors observed + # when the instance is not available yet + time.sleep(config.Config.TIMER_POLL_INTERVAL) + newInstance = self.getInstanceByReservationId(reservation.id) if newInstance: # Assign name to EC2 instance From 95ad616d8e9db78ecae379e3311c8b1928be035d Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 13 Oct 2017 19:13:21 +0000 Subject: [PATCH 043/131] Add comments. --- tangoObjects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tangoObjects.py b/tangoObjects.py index 108382dd..2ac94f70 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -9,7 +9,7 @@ redisConnection = None - +# Pass in an existing connection to redis, sometimes necessary for testing. def getRedisConnection(connection=None): global redisConnection if redisConnection is None: From 04e8176f3ea99a1a5ad5ec710a2145bd7a482286 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 7 Nov 2017 01:06:16 +0000 Subject: [PATCH 044/131] Fix a problem when failed job is last finished, it's not recognized. 
--- tools/run_jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index f4426a36..91787310 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -172,7 +172,7 @@ print("elapsed time: %s\n" % (str(datetime.timedelta(seconds = now - startTime)))) numberRemaining = len(remainingFiles) - if numberRemaining == 0: + if numberRemaining == 0 and not justFinishedFiles: print "All output files are counted for :))" break From 2544f1cd94687d799ad1f90947e29281f07b47bf Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 13 Nov 2017 21:14:08 +0000 Subject: [PATCH 045/131] checkpoint. 1. fix a bug in finding the last submission. 2. scan submissions for existing failures. --- tools/run_jobs.py | 63 ++++++++++++++++++++++++++++++++++++++++++----- tools/util.py | 14 ++++++++--- 2 files changed, 67 insertions(+), 10 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index 91787310..1f62173c 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -37,21 +37,72 @@ # get student handin files, the last submission for each student, # and make a map from email to useful attrbutes - + + # if the handin dir also has the output files from the past, use them + # as baseline. A crude test is to see if the number of output files is + # close to the number of handin files (within 10% difference). 
+ nOutputFiles = len(glob.glob(lab.handinOutputFileQuery)) + nHandinFiles = len(glob.glob(lab.handinFileQuery)) + checkHandinOutput = True if abs(nOutputFiles / float(nHandinFiles) - 1.0) < 0.1 else False + for file in sorted(glob.glob(lab.handinFileQuery)): baseName = file.split("/").pop() - matchObj = re.match(r'(.*)_[0-9]+_(.*)', baseName, re.M|re.I) + matchObj = re.match(r'(.*)_([0-9]+)_(.*)', baseName, re.M|re.I) email = matchObj.group(1) + versionStr = matchObj.group(2) + version = int(versionStr) withoutSuffix = baseName.replace(lab.handinSuffix, "") outputFile = withoutSuffix + "_" + lab.name + ".txt" jobName = lab.courseLab + "_" + withoutSuffix - + + handinOutput = None + passed = None + if checkHandinOutput: + handinOutput = lab.handinDir + "/" + email + "_" + versionStr + lab.handinOutputFileSuffix + if os.path.isfile(handinOutput): + passed = True if util.outputOK(handinOutput) else False + else: + handinOutput = None + + # add newly seen student if email not in students: students.append(email) - studentFile = {"full": file, "base": baseName, "job": jobName, - "stripped": matchObj.group(2), "output": outputFile} - student2file[email] = studentFile + + # if previous output is available, only use the submission that has matching output + if checkHandinOutput: + if email not in student2file or \ + (version > student2file[email]["version"] and \ + (handinOutput and student2file[email]["existingOutput"]) or + (not handinOutput and not student2file[email]["existingOutput"])) or \ + (not student2file[email]["existingOutput"] and handinOutput): + studentFile = {"result": passed, "existingOutput": handinOutput, # previous outcome + "version": version, "full": file, "base": baseName, "job": jobName, + "stripped": matchObj.group(3), "output": outputFile} + student2file[email] = studentFile + elif email not in student2file or version > student2file[email]["version"]: + studentFile = {"version": version, "full": file, "base": baseName, "job": jobName, + "stripped": 
matchObj.group(3), "output": outputFile} + student2file[email] = studentFile + # end of for loop in handin files + + # report pre-existing failures and missing output files + knownFailures = [] + outcomeUnknown = [] + if checkHandinOutput: + for student in students: + if student2file[student]["result"] == None: + outcomeUnknown.append(student) + elif not student2file[student]["result"]: + knownFailures.append(student) + if knownFailures: + print "#", len(knownFailures), "known failures" + for student in knownFailures: + print student, student2file[student]["existingOutput"] + if outcomeUnknown: + print "#", len(outcomeUnknownn), "students without existing output files" + for student in outcomeUnknown: + print student # print the students and the indices if cmdLine.args.list_students: diff --git a/tools/util.py b/tools/util.py index cf67c634..015a6c3c 100644 --- a/tools/util.py +++ b/tools/util.py @@ -46,14 +46,17 @@ def __init__(self, cfg, cmdLine, labIndex): self.courseLabDir = cfg.courseRoot + "/" + self.name self.makefile = self.courseLabDir + "/" + "autograde-Makefile" self.autogradeTar = self.courseLabDir + "/" + "autograde.tar" - self.handinFileQuery = "/".join([self.courseLabDir, - "handin", - "*" + self.handinSuffix]) - self.outputDir = None + + self.handinDir = "/".join([self.courseLabDir, "handin"]) + self.handinFileQuery = "/".join([self.handinDir, "*" + self.handinSuffix]) + self.handinOutputFileQuery = "/".join([self.handinDir, "*_autograde.txt"]) + self.handinOutputFileSuffix = "_" + self.name + "_autograde.txt" + self.outputDir = "/".join([cfg.tangoFileRoot, "test-" + self.courseLab, "output"]) self.outputFileQuery = self.outputDir + "/*" + self.name + ".txt" + if cmdLine.args.handin_records: self.outputFileQuery = self.courseLabDir + "/handin/*" + self.name + "_autograde.txt" print "EXAM FAILURES from", self.outputFileQuery @@ -158,3 +161,6 @@ def getRerunList(cfg, lab): failedStudents.append(s) return failedStudents + +def outputOK(file): + return 
True if "\"scores\":" in open(file).read() else False From caefb9a4f616f20289d5b70ca807b99bf97d7a25 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 16 Nov 2017 01:47:06 +0000 Subject: [PATCH 046/131] report inconsistency between current results and existing. --- tools/config_for_run_jobs.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index 2fa03a77..8ab8103e 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -10,31 +10,43 @@ class Config: # YOUR course name course = "your-name-experiment" + course = "czang-exp" # YOUR root dir for course/lab definitions and handin (student submissions) courseRoot = "/n/scratch/czang/f16/" - + #courseRoot = "/n/scratch/czang/f17/" + courseRoot = "/mnt/data/f16/" + courseRoot = "/tmp" + # YOUR lab definitions. The index of the lab is given to run_job.py labs = [ + {"name": "cloudfscheckpoint2dedup", "handinSuffix": ".tar", "image": "746.img"}, {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, - {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "newPool.img"}, - {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] + {"name": "myftlcheckpoint2", "handinSuffix": ".cpp", "image": "746.img"}, + {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "746.img"}, + {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "xyz.img"}, + {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "xyz.img"}, + {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "xyz.img"}] # Range of student submissions to run (sorted by student emails) # If either is None, all student submissions are run, unless # -r, -f, or -s is given to run_jobs. 
- firstStudentNum = 3 # start from index 3 (set to None for all students) - totalStudents = 1 # run one student + # firstStudentNum = 3 # start from index 3 (set to None for all students) + firstStudentNum = 0 # start from index 3 (set to None for all students) + totalStudents = 10 # run one student # YOUR Tango container's root dir for submissions and output tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" + tangoFileRoot = "/mnt/charlene/tango_courselabs" # YOUR Tango repo root (cloned from xyzisinus' Autolab github) tangoDir = "/h/myname/Tango" + tangoDir = "/mnt/charlene/Tango" # Sometimes multiple experimental Tango containers are run on one machine. # They are identified by different ports. tangoHostPort = "host-port 8600" + tangoHostPort = "host-port 8660" # IP of the tango container is usually computed automatically tangoIP = "" @@ -43,5 +55,6 @@ class Config: # In such case a different forwarding port is assigned to it. # Note: This variable is used by ec2Read.py only. redisPort = 6379 # standard - + # redisPort = 6380 + # end of class Config From 906bfa5b3869ba49044d4216b18c12e423146279 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 16 Nov 2017 01:54:00 +0000 Subject: [PATCH 047/131] Recover the old content of the config file after a mistaken commit. --- tools/config_for_run_jobs.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index 8ab8103e..2fa03a77 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -10,43 +10,31 @@ class Config: # YOUR course name course = "your-name-experiment" - course = "czang-exp" # YOUR root dir for course/lab definitions and handin (student submissions) courseRoot = "/n/scratch/czang/f16/" - #courseRoot = "/n/scratch/czang/f17/" - courseRoot = "/mnt/data/f16/" - courseRoot = "/tmp" - + # YOUR lab definitions. 
The index of the lab is given to run_job.py labs = [ - {"name": "cloudfscheckpoint2dedup", "handinSuffix": ".tar", "image": "746.img"}, {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, - {"name": "myftlcheckpoint2", "handinSuffix": ".cpp", "image": "746.img"}, - {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "746.img"}, - {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "xyz.img"}, - {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "xyz.img"}, - {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "xyz.img"}] + {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "newPool.img"}, + {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] # Range of student submissions to run (sorted by student emails) # If either is None, all student submissions are run, unless # -r, -f, or -s is given to run_jobs. - # firstStudentNum = 3 # start from index 3 (set to None for all students) - firstStudentNum = 0 # start from index 3 (set to None for all students) - totalStudents = 10 # run one student + firstStudentNum = 3 # start from index 3 (set to None for all students) + totalStudents = 1 # run one student # YOUR Tango container's root dir for submissions and output tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" - tangoFileRoot = "/mnt/charlene/tango_courselabs" # YOUR Tango repo root (cloned from xyzisinus' Autolab github) tangoDir = "/h/myname/Tango" - tangoDir = "/mnt/charlene/Tango" # Sometimes multiple experimental Tango containers are run on one machine. # They are identified by different ports. tangoHostPort = "host-port 8600" - tangoHostPort = "host-port 8660" # IP of the tango container is usually computed automatically tangoIP = "" @@ -55,6 +43,5 @@ class Config: # In such case a different forwarding port is assigned to it. # Note: This variable is used by ec2Read.py only. 
redisPort = 6379 # standard - # redisPort = 6380 - + # end of class Config From 3e0cd38d5482db7e20c4dd304275295dce8d5482 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 16 Nov 2017 01:54:43 +0000 Subject: [PATCH 048/131] When a job is done, report inconsistency between existing result and new. --- tools/run_jobs.py | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index 1f62173c..ce35c39a 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -100,7 +100,7 @@ for student in knownFailures: print student, student2file[student]["existingOutput"] if outcomeUnknown: - print "#", len(outcomeUnknownn), "students without existing output files" + print "#", len(outcomeUnknown), "students without existing output files" for student in outcomeUnknown: print student @@ -196,6 +196,8 @@ numberRemaining = len(remainingFiles) loopDelay = 5 badOutputFiles = [] +inconsistentResults = [] +noCompareResults = [] justFinishedFiles = [] while True and len(outputFiles) > 0: @@ -205,11 +207,22 @@ # the file may not fulled copied. So we check the files found in # the last round. 
for file in justFinishedFiles: - if "\"scores\":" not in open(file).read(): - badOutputFiles.append(file) - print("output missing scores: %s" % file) - else: + OK = util.outputOK(file) + if OK: print("Output ready: %s" % file) + else: + badOutputFiles.append(file) + print("Output missing scores: %s" % file) + + if checkHandinOutput: + matchObj = re.match(r'(.*)_[0-9]+_.*', os.path.basename(file), re.M|re.I) + email = matchObj.group(1) + if student2file[email]["result"] == None: + noCompareResults.append(file) + print("No existing result for comparison") + elif student2file[email]["result"] != OK: + inconsistentResults.append([student2file[email]["existingOutput"], file]) + print("Inconsistent with existing result %s" % student2file[email]["existingOutput"]) justFinishedFiles = [] for file in remainingFiles: @@ -228,13 +241,18 @@ break if badOutputFiles: - # not all bad files are really bad because the file copying may not - # be done when the error is reported, particularly if the file is long - realBadFiles = [] + print("Found %d output files without scores" % len(badOutputFiles)) for f in badOutputFiles: - if "\"scores\":" not in open(f).read(): - realBadFiles.append(f) - - print("Found %d bad output files" % len(realBadFiles)) - for f in realBadFiles: - print("bad output: %s" % f) + print("Output without scores: %s" % f) + +if inconsistentResults: + print("Found %d inconsistent results" % len(inconsistentResults)) + for r in inconsistentResults: + r0 = "with scores" if util.outputOK(r[0]) else "without scores" + r1 = "with scores" if util.outputOK(r[1]) else "without scores" + print("Existing(%s): %s new(%s): %s" % (r0, r[0], r1, r[1])) + +if noCompareResults: + print("Found %d results without existing comparision" % len(noCompareResults)) + for f in noCompareResults: + print("No comparison: %s" % f) From b4e71239a54bbabe37784dd82c6dab6191e6c104 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 17 Nov 2017 20:25:10 +0000 Subject: [PATCH 049/131] 
Now about to print interesting attributes of each instance. --- tools/ec2Read.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 0d6e56e7..fd5a5f40 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -1,4 +1,4 @@ -import os, sys, time, re +import os, sys, time, re, json, pprint sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from vmms.ec2SSH import Ec2SSH from preallocator import Preallocator @@ -9,6 +9,7 @@ import tangoObjects import config_for_run_jobs import redis +import boto3 # test vmms.ec2SSH's image extraction code, etc # also serve as a template of accessing the ec2SSH vmms @@ -20,11 +21,44 @@ def destroyInstances(): print "destroy", vm.name ec2.destroyVM(vm) +def listInstancesLong(): + nameInstances = [] + response = ec2client.describe_instances() + for reservation in response["Reservations"]: + for instance in reservation["Instances"]: + nameTag = (item for item in instance["Tags"] if item["Key"] == "Name").next() + nameInstances.append({"Name": nameTag["Value"] if nameTag else "None", + "Instance": instance}) + + print len(nameInstances), "instances:" + for item in sorted(nameInstances, key=lambda x: x["Name"]): + # pp = pprint.PrettyPrinter(indent=2) + # pp.pprint(instance) + instance = item["Instance"] + print("%s: %s %s %s" % + (item["Name"], instance["InstanceId"], instance["PublicIpAddress"], instance["LaunchTime"])) + for tag in instance["Tags"]: + print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) + + """ useful sometimes + print "ImageId:", instance["ImageId"] + print "PublicDnsName:", instance["PublicDnsName"] + print "InstanceType:", instance["InstanceType"] + print "State:", instance["State"]["Name"] + print "SecurityGroups:", instance["SecurityGroups"] + image = ec2resource.Image(instance["ImageId"]) + print "Image:", image.image_id + for tag in image.tags: + print("\t tag {%s: %s}" % (tag["Key"], 
tag["Value"])) + """ + def listInstances(): + """ vms = ec2.getVMs() print "aws instances", len(vms) for vm in sorted(vms, key=lambda x: x.name): - print "vm", vm.name + print "vm", vm.name, vm.ec2_id + """ print "pools", ec2.img2ami.keys() for key in server.preallocator.machines.keys(): pool = server.preallocator.getPool(key) @@ -71,11 +105,14 @@ def allocateVMs(): redisConnection = redis.StrictRedis( host=Config.REDIS_HOSTNAME, port=config_for_run_jobs.Config.redisPort, db=0) tangoObjects.getRedisConnection(connection=redisConnection) +ec2client = boto3.client("ec2", Config.EC2_REGION) +ec2resource = boto3.resource("ec2", Config.EC2_REGION) server = TangoServer() ec2 = server.preallocator.vmms["ec2SSH"] pools = ec2.img2ami +listInstancesLong() listInstances() exit() destroyInstances() From 9f188908330464827958b548bb922761c281cd18 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 17 Nov 2017 21:12:34 +0000 Subject: [PATCH 050/131] Only look at running instances. --- tools/ec2Read.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index fd5a5f40..4f0c0f93 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -26,6 +26,8 @@ def listInstancesLong(): response = ec2client.describe_instances() for reservation in response["Reservations"]: for instance in reservation["Instances"]: + if instance["State"]["Name"] != "running": + continue nameTag = (item for item in instance["Tags"] if item["Key"] == "Name").next() nameInstances.append({"Name": nameTag["Value"] if nameTag else "None", "Instance": instance}) From 7580354530edb20ef629fbf4df1733e889468d3b Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Sat, 18 Nov 2017 15:44:10 +0000 Subject: [PATCH 051/131] print instance launch time in local zone. 
--- tools/ec2Read.py | 54 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 4f0c0f93..ba5a94c4 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -1,4 +1,4 @@ -import os, sys, time, re, json, pprint +import os, sys, time, re, json, pprint, datetime sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from vmms.ec2SSH import Ec2SSH from preallocator import Preallocator @@ -10,10 +10,17 @@ import config_for_run_jobs import redis import boto3 +import pytz +import tzlocal # test vmms.ec2SSH's image extraction code, etc # also serve as a template of accessing the ec2SSH vmms +local_tz = pytz.timezone("EST") +def utc_to_local(utc_dt): + local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) + return local_tz.normalize(local_dt) + def destroyInstances(): vms = ec2.getVMs() for vm in vms: @@ -21,26 +28,51 @@ def destroyInstances(): print "destroy", vm.name ec2.destroyVM(vm) +local_tz = pytz.timezone("EST") + +def utc_to_local(utc_dt): + local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) + return local_tz.normalize(local_dt) + +def changeTags(instanceId): + instance = boto3resource.Instance(instanceId) + print instance.tags + nameTag = (item for item in instance.tags if item["Key"] == "Name").next() + if nameTag: + tagCopy = [nameTag["Key"], nameTag["Value"]] + print tagCopy + # instance.delete_tags(Tags=[intance.tags[0]]) + print boto3resource.Instance(instanceId).tags + def listInstancesLong(): nameInstances = [] - response = ec2client.describe_instances() + response = boto3connection.describe_instances() for reservation in response["Reservations"]: for instance in reservation["Instances"]: if instance["State"]["Name"] != "running": continue - nameTag = (item for item in instance["Tags"] if item["Key"] == "Name").next() - nameInstances.append({"Name": nameTag["Value"] if nameTag else "None", - "Instance": instance}) + 
if "Tags" in instance: + nameTag = (item for item in instance["Tags"] if item["Key"] == "Name").next() + nameInstances.append({"Name": nameTag["Value"] if nameTag else "None", + "Instance": instance}) + else: + nameInstances.append({"Name": "None", "Instance": instance}) + + # changeTags(nameInstances[-1]["Instance"]["InstanceId"]) print len(nameInstances), "instances:" for item in sorted(nameInstances, key=lambda x: x["Name"]): # pp = pprint.PrettyPrinter(indent=2) # pp.pprint(instance) instance = item["Instance"] + launchTime = utc_to_local(instance["LaunchTime"]) print("%s: %s %s %s" % - (item["Name"], instance["InstanceId"], instance["PublicIpAddress"], instance["LaunchTime"])) - for tag in instance["Tags"]: - print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) + (item["Name"], instance["InstanceId"], instance["PublicIpAddress"], launchTime)) + if "Tags" in instance: + for tag in instance["Tags"]: + print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) + else: + print("\t No tags") """ useful sometimes print "ImageId:", instance["ImageId"] @@ -48,7 +80,7 @@ def listInstancesLong(): print "InstanceType:", instance["InstanceType"] print "State:", instance["State"]["Name"] print "SecurityGroups:", instance["SecurityGroups"] - image = ec2resource.Image(instance["ImageId"]) + image = boto3resource.Image(instance["ImageId"]) print "Image:", image.image_id for tag in image.tags: print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) @@ -107,8 +139,8 @@ def allocateVMs(): redisConnection = redis.StrictRedis( host=Config.REDIS_HOSTNAME, port=config_for_run_jobs.Config.redisPort, db=0) tangoObjects.getRedisConnection(connection=redisConnection) -ec2client = boto3.client("ec2", Config.EC2_REGION) -ec2resource = boto3.resource("ec2", Config.EC2_REGION) +boto3connection = boto3.client("ec2", Config.EC2_REGION) +boto3resource = boto3.resource("ec2", Config.EC2_REGION) server = TangoServer() ec2 = server.preallocator.vmms["ec2SSH"] From 
27fe73babbcbdb7d63270a6a7e6700add0c5a49d Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Sat, 18 Nov 2017 19:05:40 +0000 Subject: [PATCH 052/131] Save the code of tag changing. Its mission is finished now the feature of keeping vm after failure test has been implemented. --- tools/ec2Read.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index ba5a94c4..07da4c72 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -34,15 +34,16 @@ def utc_to_local(utc_dt): local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) return local_tz.normalize(local_dt) -def changeTags(instanceId): +# test changing tags to keep the vm after test failure +def changeTags(instanceId, name, notes): + return + print "change tags for", instanceId instance = boto3resource.Instance(instanceId) - print instance.tags - nameTag = (item for item in instance.tags if item["Key"] == "Name").next() - if nameTag: - tagCopy = [nameTag["Key"], nameTag["Value"]] - print tagCopy - # instance.delete_tags(Tags=[intance.tags[0]]) - print boto3resource.Instance(instanceId).tags + tag = boto3resource.Tag(instanceId, "Name", name) + if tag: + tag.delete() + instance.create_tags(Tags=[{"Key": "Name", "Value": "failed-" + name}]) + instance.create_tags(Tags=[{"Key": "Notes", "Value": notes}]) def listInstancesLong(): nameInstances = [] @@ -58,7 +59,9 @@ def listInstancesLong(): else: nameInstances.append({"Name": "None", "Instance": instance}) - # changeTags(nameInstances[-1]["Instance"]["InstanceId"]) + sortedInstances = sorted(nameInstances, key=lambda x: x["Name"]) + changeTags(sortedInstances[-1]["Instance"]["InstanceId"], + sortedInstances[-1]["Name"], "test-name-xxx") print len(nameInstances), "instances:" for item in sorted(nameInstances, key=lambda x: x["Name"]): From 14251e99a9f3d9d8f4ba3163fa7069ec3ff327dc Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Sat, 18 Nov 2017 19:10:14 +0000 Subject: 
[PATCH 053/131] Add ability to KEEP_VM_AFTER_FAILURE. --- tangoObjects.py | 4 ++++ vmms/ec2SSH.py | 20 +++++++++++++++++++- worker.py | 2 ++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tangoObjects.py b/tangoObjects.py index 2ac94f70..7d966c86 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -61,6 +61,10 @@ def __init__(self, name="DefaultTestVM", image=None, vmms=None, self.resume = resume self.id = id self.instance_id = id + # The following attributes can instruct vmms to set the test machine + # aside for further investigation. + self.doNotDestroy = False + self.notes = None def __repr__(self): return "TangoMachine(image: %s, vmms: %s, id: %s)" % (self.image, self.vmms, self.id) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index d63f4b00..d45bedbd 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -113,6 +113,9 @@ def __init__(self, accessKeyId=None, accessKey=None): self.connection = ec2.connect_to_region(config.Config.EC2_REGION) self.useDefaultKeyPair = True + self.boto3connection = boto3.client("ec2", config.Config.EC2_REGION) + self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) + # Use boto3 to read images. Find the "Name" tag and use it as key to # build a map from "Name tag" to boto3's image structure. # The code is currently using boto 2 for most of the work and we don't @@ -447,6 +450,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): maxOutputFileSize) ret = timeout(["ssh"] + self.ssh_flags + ["%s@%s" % (config.Config.EC2_USER_NAME, domain_name), runcmd], runTimeout * 2) + # return 3 # xxx inject error to test KEEP_VM_AFTER_FAILURE return ret # runTimeout * 2 is a temporary hack. 
The driver will handle the timout @@ -502,7 +506,21 @@ def destroyVM(self, vm): self.log.info("destroyVM: instance non-exist %s %s" % (vm.ec2_id, vm.name)) return [] - self.log.info("destroyVM: %s %s" % (vm.ec2_id, vm.name)) + self.log.info("destroyVM: %s %s %s %s" % (vm.ec2_id, vm.name, vm.doNotDestroy, vm.notes)) + + if hasattr(config.Config, 'KEEP_VM_AFTER_FAILURE') and \ + config.Config.KEEP_VM_AFTER_FAILURE and vm.doNotDestroy: + iName = self.instanceName(vm.id, vm.name) + self.log.info("Will keep VM %s for further debugging" % iName) + instance = self.boto3resource.Instance(vm.ec2_id) + # delete original name tag and replace it with "failed-xxx" + # add notes tag for test name + tag = self.boto3resource.Tag(vm.ec2_id, "Name", iName) + if tag: + tag.delete() + instance.create_tags(Tags=[{"Key": "Name", "Value": "failed-" + iName}]) + instance.create_tags(Tags=[{"Key": "Notes", "Value": vm.notes}]) + return ret = self.connection.terminate_instances(instance_ids=[vm.ec2_id]) # delete dynamically created key diff --git a/worker.py b/worker.py index f58e378c..8684aa75 100644 --- a/worker.py +++ b/worker.py @@ -312,6 +312,8 @@ def run(self): # the VM. msg = "Error: OS error while running job on VM" (returnVM, replaceVM) = (False, True) + self.job.vm.doNotDestroy = True + self.job.vm.notes = str(self.job.id) + "_" + self.job.name else: # This should never happen msg = "Error: Unknown autodriver error (status=%d)" % ( ret["runjob"]) From 1b1ef289d29aaa8a6e4673b0244749adfe72c48c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 27 Nov 2017 16:51:36 +0000 Subject: [PATCH 054/131] disable a testing statement in the ec2 tool. 
--- tools/ec2Read.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 07da4c72..b80214c8 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -60,8 +60,8 @@ def listInstancesLong(): nameInstances.append({"Name": "None", "Instance": instance}) sortedInstances = sorted(nameInstances, key=lambda x: x["Name"]) - changeTags(sortedInstances[-1]["Instance"]["InstanceId"], - sortedInstances[-1]["Name"], "test-name-xxx") + # changeTags(sortedInstances[-1]["Instance"]["InstanceId"], + # sortedInstances[-1]["Name"], "test-name-xxx") print len(nameInstances), "instances:" for item in sorted(nameInstances, key=lambda x: x["Name"]): From ebafb72fe1305b03beb4af5810bbb7845bcabfe4 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 29 Nov 2017 17:56:04 +0000 Subject: [PATCH 055/131] Move timeout report to the right place. add duration reporting. --- autodriver/autodriver.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index e4f05a60..1f8fb7e1 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -84,6 +84,8 @@ struct arguments { char *directory; } args; +unsigned long startTime = 0; + /** * @brief Parses a string into an unsigned integer. 
* @@ -425,6 +427,7 @@ static int monitor_child(pid_t child) { if (sigtimedwait(&sigset, NULL, &timeout) < 0) { // Child timed out + printf(OUTPUT_HEADER "Job timed out after %d seconds\n", args.timeout); assert(errno == EAGAIN); kill(child, SIGKILL); killed = 1; @@ -436,9 +439,9 @@ static int monitor_child(pid_t child) { exit(EXIT_OSERROR); } - if (killed) { - printf(OUTPUT_HEADER "Job timed out after %d seconds\n", args.timeout); - } else { + printf(OUTPUT_HEADER "Duration of test is %lu seconds\n", time(NULL) - startTime); + + if (!killed) { printf(OUTPUT_HEADER "Job exited with status %d\n", WEXITSTATUS(status)); } @@ -503,7 +506,7 @@ static void run_job(void) { int fd; if ((fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { - ERROR_ERRNO("Error opening output file"); + ERROR_ERRNO("Error creating output file"); exit(EXIT_OSERROR); } @@ -540,6 +543,7 @@ int main(int argc, char **argv) { args.fsize = 0; args.timeout = 0; args.osize = 0; + startTime = time(NULL); // Make sure this isn't being run as root if (getuid() == 0) { From 39c02fc978890e096ae870a041640fbae424ae38 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 4 Dec 2017 22:26:01 +0000 Subject: [PATCH 056/131] temporarily commmit experimental file before moving its useful parts into autodriver. 
--- autodriver/try.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 autodriver/try.c diff --git a/autodriver/try.c b/autodriver/try.c new file mode 100644 index 00000000..7a573180 --- /dev/null +++ b/autodriver/try.c @@ -0,0 +1,183 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int filedes[2]; + +void childFunc() { + int i; + char j; + char buffer[10000]; + for (i = 0; i < 1000; i++) { + for (j = '0'; j <= '9'; j++) { + buffer[ i * 10 + (j - '0') ] = j; + } + } + buffer[9999] = '\0'; + printf("%s", buffer); + + /* + for (i = 0; i < 5; i++) { + system("date"); + fprintf(stdout, "stdout in child\n"); + int eol = rand() % 2; // boolean + if (eol) { + fprintf(stderr, "stderr with eol in child\n"); + } else { + fprintf(stderr, "stderr without eol in child"); + } + //sleep(3); + } + */ + + _exit(1); +} + +void* readFunc() { + char buffer[70]; + + int timestampAfterNextRead = 1; // boolean + while (1) { + memset(&buffer[0], 0, sizeof(buffer)); + ssize_t count = read(filedes[0], buffer, sizeof(buffer) - 1); + + + if (count == -1) { + if (errno == EINTR) { + continue; + } else { + perror("read"); + exit(1); + } + } else if (count == 0) { + fprintf(stderr, "exit \n"); + break; + } else { + // int insertNull = rand() % 2; // boolean + // int insertIndex = rand() % count; + int processedCount = 0; + int addTimestamp = timestampAfterNextRead; + + fprintf(stderr, "\n====================================\n"); + fprintf(stderr, "### read %lu bytes: \"%s\"\n", count, buffer); + timestampAfterNextRead = (buffer[count - 1] == '\n'); // boolean + + /* + fprintf(stderr, "### random insert index %d %d, \"%s\", \"%s\"\n", + insertNull, insertIndex, buffer, &buffer[insertIndex]); + if (insertNull) { + buffer[insertIndex] = '\0'; + } + */ + + char *result = strtok(buffer, "\n"); + 
while (1) { + if (!result) { + if (processedCount < count) { // must have seen a NULL + if (!buffer[processedCount]) { + processedCount++; + } + addTimestamp = 1; // null is dealt like \n + result = strtok(&buffer[processedCount], "\n"); + fprintf(stderr, "### processed after null \"%s\"\n", &buffer[processedCount]); + continue; + } + break; + } + + fprintf(stderr, "### result \"%s\" %lu, \"%s\"\n", result, strlen(result), + &buffer[processedCount]); + processedCount += strlen(result) + 1; + char *eol = (processedCount >= count && !timestampAfterNextRead) ? "" : "\n"; + fprintf(stderr, (eol[0] == '\n') ? "Add eol\n" : "Not add eol\n"); + + assert(processedCount <= count + 1); + + time_t ltime = time(NULL); + struct tm* tmInfo = localtime(<ime); + char timeStr[100]; + if (addTimestamp) { + strftime(timeStr, 100, "%Y%m%d-%H:%M:%S", tmInfo); + printf("%s: \"%s\"%s", timeStr, result, eol); + } else { + printf("\"%s\"%s", result, eol); + } + + addTimestamp = 1; + result = strtok(NULL, "\n"); + } + + addTimestamp = (buffer[count - 1] == '\n'); // boolean + } + } + return NULL; +} + +int main() { + putenv("TZ=America/New_York"); + tzset(); + + if (pipe(filedes) == -1) { + perror("pipe"); + exit(1); + } + + pid_t pid = fork(); + if (pid == -1) { + perror("fork"); + exit(1); + } else if (pid == 0) { + setvbuf(stdout, NULL, _IONBF, 0); + while ((dup2(filedes[1], STDOUT_FILENO) == -1) && (errno == EINTR)) {} + while ((dup2(filedes[1], STDERR_FILENO) == -1) && (errno == EINTR)) {} + close(filedes[1]); + close(filedes[0]); + childFunc(); + } + + // parent process comes here + pthread_t readThread; + + /* create a second thread which executes inc_x(&x) */ + if(pthread_create(&readThread, NULL, readFunc, NULL)) { + perror("create thread"); + exit(1); + } + + /* + result = waitpid(pid, &status, WNOHANG); + if (result == 0) { + printf("child done\n"); + break; + } else if (result < 0) { + perror("waitpid"); + exit(1); + } + printf("wait pid %d\n", result); + */ + + { + int status; 
+ wait(&status); + + sleep(10); + + status = pthread_cancel(readThread); + close(filedes[0]); + } +} From c1de4c09d5b04c7fa201eeb824a369503775b4d4 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 12 Dec 2017 15:02:28 -0500 Subject: [PATCH 057/131] Add config for autodriver logging. --- config.template.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/config.template.py b/config.template.py index b508031e..84124bce 100644 --- a/config.template.py +++ b/config.template.py @@ -69,6 +69,10 @@ class Config: RUNJOB_TIMEOUT = 60 COPYOUT_TIMEOUT = 30 + # time zone and timestamp report interval for autodriver execution + AUTODRIVER_LOGGING_TIME_ZONE = "UTC" # e.g. "America/New_York". NULL => UTC + AUTODRIVER_TIMESTAMP_INTERVAL = 0 # in seconds. 0 => no timestamp insersion + # Docker constants BOOT2DOCKER_INIT_TIMEOUT = 5 BOOT2DOCKER_START_TIMEOUT = 30 @@ -111,8 +115,8 @@ class Config: # vm pool reserve size. If set, free pool size is maintained at the level. POOL_SIZE_LOW_WATER_MARK = 5 # optional, can be None - # Default increment step when enlarging vm pool - POOL_ALLOC_INCREMENT = 2 # can be None, which is treated as 1 + # Increment step when enlarging vm pool + POOL_ALLOC_INCREMENT = 2 # can be None, which is treated as 1, the default # Optionally log finer-grained timing information LOG_TIMING = False From a715788f53b44c7b4e39f635ecb9cdd53df4aef2 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 12 Dec 2017 15:35:10 -0500 Subject: [PATCH 058/131] checkpoint before adding timestamp insersion into user output. --- autodriver/autodriver.c | 79 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 1f8fb7e1..a24c4b35 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -36,16 +36,38 @@ #include #include #include +#include #define min(x, y) ((x) < (y) ? 
(x) : (y)) +char timestampStr[100]; +char * getTimestamp(void) { + time_t ltime = time(NULL); + struct tm* tmInfo = localtime(<ime); + strftime(timestampStr, 100, "%Y%m%d-%H:%M:%S", tmInfo); + return timestampStr; // return global variable for conveniece +} + +void printError(char *msg, int line, int dumpErrno) { + if (dumpErrno) { + printf("Autodriver@%s: ERROR %s at line %d: %s\n", getTimestamp(), msg, line, + strerror(errno)); + } else { + printf("Autodriver@%s: ERROR %s at line %d\n", getTimestamp(), msg, line); + } +} + #define OUTPUT_HEADER "Autodriver: " -#define ERROR_ERRNO(msg) \ - printf(OUTPUT_HEADER "%s at line %d: %s\n", msg, __LINE__, strerror(errno)) +#define ERROR_ERRNO(msg) printError(msg, __LINE__, 1) + +#define ERROR(msg) printError(msg, __LINE__, 0) -#define ERROR(msg) \ - printf(OUTPUT_HEADER "%s at line %d\n", msg, __LINE__) +#define MESSAGE(format, ...) \ + printf("Autodriver@%s: " format "\n", getTimestamp(), ##__VA_ARGS__) + +#define TIMESTAMP() \ + printf("Autodriver@%s: Time stamp inserted by audodriver\n", getTimestamp()); #define EXIT__BASE 1 @@ -82,6 +104,8 @@ struct arguments { struct passwd user_info; char *passwd_buf; char *directory; + char *timezone; + unsigned timestamp_interval; } args; unsigned long startTime = 0; @@ -235,6 +259,15 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { "The argument to osize must be a nonnegative integer"); } break; + case 'i': + if (parse_uint(arg, &arguments->timestamp_interval) < 0) { + argp_failure(state, EXIT_USAGE, 0, + "The argument to timestamp-interval must be a nonnegative integer"); + } + break; + case 'z': + args.timezone = arg; + break; case ARGP_KEY_ARG: switch (state->arg_num) { case 0: @@ -322,6 +355,8 @@ static void dump_output(void) { exit(EXIT_OSERROR); } + // xxx insert time stamps. 
+ struct stat stat; if (fstat(outfd, &stat) < 0) { ERROR_ERRNO("Error stating output file"); @@ -338,6 +373,9 @@ static void dump_output(void) { if (dump_file(outfd, part_size, stat.st_size - part_size) < 0) { exit(EXIT_OSERROR); } + + // xxx message indicating file has been truncated + } else { if (dump_file(outfd, stat.st_size, 0) < 0) { exit(EXIT_OSERROR); @@ -401,6 +439,8 @@ static void cleanup(void) { } } +// xxx add thread function for time stamp recording + /** * @brief Monitors the progression of the child * @@ -434,6 +474,8 @@ static int monitor_child(pid_t child) { } } + // xxx create a thread for time stamp recording + if (waitpid(child, &status, 0) < 0) { ERROR_ERRNO("Error reaping child"); exit(EXIT_OSERROR); @@ -504,7 +546,7 @@ static void run_job(void) { // Redirect output int fd; - if ((fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC, + if ((fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, // no buffering S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { ERROR_ERRNO("Error creating output file"); exit(EXIT_OSERROR); @@ -525,11 +567,13 @@ static void run_job(void) { exit(EXIT_OSERROR); } + /* xxx this should be open // Switch into the folder if (chdir(args.directory) < 0) { ERROR_ERRNO("Error changing directory"); exit(EXIT_OSERROR); } + */ // Finally exec job execl("/usr/bin/make", "make", NULL); @@ -543,6 +587,8 @@ int main(int argc, char **argv) { args.fsize = 0; args.timeout = 0; args.osize = 0; + args.timestamp_interval = 30; + args.timezone = NULL; startTime = time(NULL); // Make sure this isn't being run as root @@ -570,6 +616,10 @@ int main(int argc, char **argv) { "Limit the amount of time a job is allowed to run (seconds)", 0}, {"osize", 'o', "size", 0, "Limit the amount of output returned (bytes)", 0}, + {"timestamp-interval", 'i', "interval", 0, + "Interval (seconds) for placing timestamps in user output file", 0}, + {"timezone", 'z', "timezone", 0, + "Timezone setting. 
Default is UTC", 0}, {0, 0, 0, 0, 0, 0} }; @@ -578,7 +628,24 @@ int main(int argc, char **argv) { argp_parse(&parser, argc, argv, 0, NULL, &args); - setup_dir(); + // set time zone preference: -z argument, TZ environment variable, system wide + if (args.timezone) { + char tz[100]; + strcpy(tz, "TZ="); + strcat(tz, args.timezone); + putenv(tz); + } + tzset(); + MESSAGE("Time zone %s:%s", tzname[0], tzname[1]); + + /* + ERROR("test test"); + MESSAGE("%s %d interval %d", "abc", 123, args.timestamp_interval); + TIMESTAMP(); + exit(1); + */ + + // setup_dir(); // xxx should be open // Block SIGCHLD to make sure monitor_child recieves it. sigset_t sigset; From fb0d78bd1e8e4f715504fb4c6c11c6dad981113c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 18 Dec 2017 12:53:18 -0500 Subject: [PATCH 059/131] checkpoint. Add timekeeping thread. change printing macros. --- autodriver/autodriver.c | 105 ++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 30 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index a24c4b35..88e5a432 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -37,37 +37,31 @@ #include #include #include +#include #define min(x, y) ((x) < (y) ? (x) : (y)) char timestampStr[100]; -char * getTimestamp(void) { - time_t ltime = time(NULL); +char * getTimestamp(time_t t) { + time_t ltime = t ? t : time(NULL); struct tm* tmInfo = localtime(<ime); strftime(timestampStr, 100, "%Y%m%d-%H:%M:%S", tmInfo); return timestampStr; // return global variable for conveniece } -void printError(char *msg, int line, int dumpErrno) { - if (dumpErrno) { - printf("Autodriver@%s: ERROR %s at line %d: %s\n", getTimestamp(), msg, line, - strerror(errno)); - } else { - printf("Autodriver@%s: ERROR %s at line %d\n", getTimestamp(), msg, line); - } -} - -#define OUTPUT_HEADER "Autodriver: " +#define ERROR_ERRNO(format, ...) 
\ + printf("Autodriver@%s: ERROR " format " at line %d: %s\n", \ + getTimestamp(0), ##__VA_ARGS__, __LINE__, strerror(errno)) -#define ERROR_ERRNO(msg) printError(msg, __LINE__, 1) - -#define ERROR(msg) printError(msg, __LINE__, 0) +#define ERROR(format, ...) \ + printf("Autodriver@%s: ERROR " format " at line %d\n", \ + getTimestamp(0), ##__VA_ARGS__, __LINE__) #define MESSAGE(format, ...) \ - printf("Autodriver@%s: " format "\n", getTimestamp(), ##__VA_ARGS__) + printf("Autodriver@%s: " format "\n", getTimestamp(0), ##__VA_ARGS__) #define TIMESTAMP() \ - printf("Autodriver@%s: Time stamp inserted by audodriver\n", getTimestamp()); + printf("Autodriver@%s: Time stamp inserted by autodriver\n", getTimestamp(0)); #define EXIT__BASE 1 @@ -110,6 +104,14 @@ struct arguments { unsigned long startTime = 0; +typedef struct { + time_t time; + size_t cursor; +} timestamp_map_t; + +// #define TIMESTAMP_MAP_CHUNK_SIZE 1024 xxx put it back +#define TIMESTAMP_MAP_CHUNK_SIZE 10 + /** * @brief Parses a string into an unsigned integer. 
* @@ -369,7 +371,7 @@ static void dump_output(void) { if (dump_file(outfd, part_size, 0) < 0) { exit(EXIT_OSERROR); } - printf("\n...[excess bytes elided]...\n"); + MESSAGE("\n...[excess bytes elided by autodriver]...\n"); if (dump_file(outfd, part_size, stat.st_size - part_size) < 0) { exit(EXIT_OSERROR); } @@ -439,7 +441,46 @@ static void cleanup(void) { } } -// xxx add thread function for time stamp recording +// pthread function, keep a map of timestamp and user's output file size +timestamp_map_t *timestampMap = NULL; +unsigned timestampCount = 0; +void *timestampFunc() { + sleep(10); // wait a bit for the output file to be created by child process + while (1) { + if (timestampCount % TIMESTAMP_MAP_CHUNK_SIZE == 0) { + timestamp_map_t *newBuffer = + realloc(timestampMap, + sizeof(timestamp_map_t) * (TIMESTAMP_MAP_CHUNK_SIZE + timestampCount)); + // printf("allocate %d items\n", (TIMESTAMP_MAP_CHUNK_SIZE + timestampCount)); + if (!newBuffer){ + ERROR_ERRNO("Failed to allocate timestamp map. 
Current map size %d", + timestampCount); + exit(EXIT_OSERROR); + } + timestampMap = newBuffer; + newBuffer += timestampCount; + memset(newBuffer, 0, sizeof(timestamp_map_t) * TIMESTAMP_MAP_CHUNK_SIZE); + } + + int outfd; + if ((outfd = open(OUTPUT_FILE, O_RDONLY)) < 0) { + ERROR_ERRNO("Error opening output file"); + exit(EXIT_OSERROR); + } + struct stat buf; + fstat(outfd, &buf); + timestampMap[timestampCount].time = time(NULL); + timestampMap[timestampCount].cursor = buf.st_size; + timestampCount++; + if (close(outfd) < 0) { + ERROR_ERRNO("Error closing output file"); + exit(EXIT_OSERROR); + } + sleep(args.timestamp_interval); + } + + return NULL; +} /** * @brief Monitors the progression of the child @@ -455,6 +496,15 @@ static int monitor_child(pid_t child) { int killed = 0; int status; + // create a thread for for file size tracking by time interval + pthread_t timestampThread = 0; // this thread needs no cancellation + if (args.timestamp_interval) { + if (pthread_create(×tampThread, NULL, timestampFunc, NULL)) { + ERROR_ERRNO("Failed to create timestamp thread"); + exit(EXIT_OSERROR); + } + } + // Handle the timeout if we have to if (args.timeout != 0) { struct timespec timeout; @@ -467,25 +517,22 @@ static int monitor_child(pid_t child) { if (sigtimedwait(&sigset, NULL, &timeout) < 0) { // Child timed out - printf(OUTPUT_HEADER "Job timed out after %d seconds\n", args.timeout); + ERROR_ERRNO("Job timed out after %d seconds\n", args.timeout); assert(errno == EAGAIN); kill(child, SIGKILL); killed = 1; } } - // xxx create a thread for time stamp recording - if (waitpid(child, &status, 0) < 0) { ERROR_ERRNO("Error reaping child"); exit(EXIT_OSERROR); } - printf(OUTPUT_HEADER "Duration of test is %lu seconds\n", time(NULL) - startTime); + MESSAGE("Duration of test is %lu seconds", time(NULL) - startTime); if (!killed) { - printf(OUTPUT_HEADER "Job exited with status %d\n", - WEXITSTATUS(status)); + MESSAGE("Job exited with status %d", WEXITSTATUS(status)); } 
dump_output(); @@ -567,13 +614,11 @@ static void run_job(void) { exit(EXIT_OSERROR); } - /* xxx this should be open // Switch into the folder if (chdir(args.directory) < 0) { ERROR_ERRNO("Error changing directory"); exit(EXIT_OSERROR); } - */ // Finally exec job execl("/usr/bin/make", "make", NULL); @@ -587,13 +632,13 @@ int main(int argc, char **argv) { args.fsize = 0; args.timeout = 0; args.osize = 0; - args.timestamp_interval = 30; + args.timestamp_interval = 1; // xxx change back to 30 args.timezone = NULL; startTime = time(NULL); // Make sure this isn't being run as root if (getuid() == 0) { - printf(OUTPUT_HEADER "Autodriver should not be run as root.\n"); + ERROR("Autodriver should not be run as root"); exit(EXIT_USAGE); } @@ -645,7 +690,7 @@ int main(int argc, char **argv) { exit(1); */ - // setup_dir(); // xxx should be open + setup_dir(); // Block SIGCHLD to make sure monitor_child recieves it. sigset_t sigset; From 364ae409c2485242677b79d9ded0234288b7ad53 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 22 Dec 2017 16:03:00 -0500 Subject: [PATCH 060/131] checkpoint. code complete but need to move file creation to parent proc. --- autodriver/autodriver.c | 217 ++++++++++++++++++++++++++-------------- 1 file changed, 142 insertions(+), 75 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 88e5a432..ea80eaa2 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -60,9 +60,6 @@ char * getTimestamp(time_t t) { #define MESSAGE(format, ...) 
\ printf("Autodriver@%s: " format "\n", getTimestamp(0), ##__VA_ARGS__) -#define TIMESTAMP() \ - printf("Autodriver@%s: Time stamp inserted by autodriver\n", getTimestamp(0)); - #define EXIT__BASE 1 /* Exit codes for use after errors */ @@ -106,12 +103,21 @@ unsigned long startTime = 0; typedef struct { time_t time; - size_t cursor; + size_t offset; } timestamp_map_t; // #define TIMESTAMP_MAP_CHUNK_SIZE 1024 xxx put it back #define TIMESTAMP_MAP_CHUNK_SIZE 10 +timestamp_map_t *timestampMap = NULL; +unsigned timestampCount = 0; +unsigned currentStamp = 0; + +size_t outputFileSize = 0; + +int outputTruncated = 0; +int timestampInserted = 0; + /** * @brief Parses a string into an unsigned integer. * @@ -182,6 +188,66 @@ static int parse_user(char *name, struct passwd *user_info, char **buf) { return 0; } +// pthread function, keep a map of timestamp and user's output file size +void *timestampFunc() { + sleep(10); // wait a bit for the output file to be created by child process + while (1) { + if (timestampCount % TIMESTAMP_MAP_CHUNK_SIZE == 0) { + timestamp_map_t *newBuffer = + realloc(timestampMap, + sizeof(timestamp_map_t) * (TIMESTAMP_MAP_CHUNK_SIZE + timestampCount)); + // printf("allocate %d items\n", (TIMESTAMP_MAP_CHUNK_SIZE + timestampCount)); + if (!newBuffer){ + ERROR_ERRNO("Failed to allocate timestamp map. 
Current map size %d", + timestampCount); + exit(EXIT_OSERROR); + } + timestampMap = newBuffer; + newBuffer += timestampCount; + memset(newBuffer, 0, sizeof(timestamp_map_t) * TIMESTAMP_MAP_CHUNK_SIZE); + } + + int outfd; + if ((outfd = open(OUTPUT_FILE, O_RDONLY)) < 0) { + ERROR_ERRNO("Error opening output file"); + exit(EXIT_OSERROR); + } + struct stat buf; + fstat(outfd, &buf); + timestampMap[timestampCount].time = time(NULL); + timestampMap[timestampCount].offset = buf.st_size; + timestampCount++; + if (close(outfd) < 0) { + ERROR_ERRNO("Error closing output file"); + exit(EXIT_OSERROR); + } + sleep(args.timestamp_interval); + } + + return NULL; +} + +int writeBuffer(char *buffer, size_t nBytes) { + ssize_t nwritten; + size_t write_rem; + char *write_base; + + write_rem = nBytes; + write_base = buffer; + while (write_rem > 0) { + if ((nwritten = write(STDOUT_FILENO, write_base, write_rem)) < 0) { + ERROR_ERRNO("Error writing output"); + return -1; + } + write_rem -= nwritten; + write_base += nwritten; + } + return 0; +} + +#define WRITE_BUFFER(buffer, nBytes) \ + if (writeBuffer(buffer, nBytes)) return -1 + /** * @brief Dumps a specified number of bytes from a file to standard out * @@ -193,9 +259,12 @@ static int parse_user(char *name, struct passwd *user_info, char **buf) { */ static int dump_file(int fd, size_t bytes, off_t offset) { char buffer[BUFSIZE]; - char *write_base; - ssize_t nread, nwritten; - size_t read_rem, write_rem; + size_t read_rem = bytes; + size_t currentOffset = offset; + size_t nextOffset = offset; + + char *endOfBuffer = "END OF BUFFER\n"; + char *aboutToInsert = "ABOUT TO INSERT\n"; // Flush stdout so our writes here don't race with buffer flushes if (fflush(stdout) != 0) { @@ -208,24 +277,71 @@ static int dump_file(int fd, size_t bytes, off_t offset) { return -1; } - read_rem = bytes; while (read_rem > 0) { - if ((nread = read(fd, buffer, min(read_rem, BUFSIZE))) < 0) { - ERROR_ERRNO("Error reading from output file"); - return -1; + 
ssize_t nread; + char *scanCursor; + + currentOffset = nextOffset; + memset(buffer, 0, BUFSIZE); + + if ((nread = read(fd, buffer, min(read_rem, BUFSIZE))) < 0) { + ERROR_ERRNO("Error reading from output file"); + return -1; + } + read_rem -= nread; + nextOffset += nread; // offset of currently read buffer in the file + scanCursor = buffer; + + // pace through timestamps that fall into the buffer + while (currentStamp < timestampCount && + timestampMap[currentStamp].offset < nextOffset) { + char *nextEol = strchr(scanCursor, '\n'); + if (!nextEol) { // no line break found in read buffer to insert timestamp + break; } - write_rem = nread; - write_base = buffer; - while (write_rem > 0) { - if ((nwritten = write(STDOUT_FILENO, write_base, write_rem)) < 0) { - ERROR_ERRNO("Error writing output"); - return -1; - } - write_rem -= nwritten; - write_base += nwritten; + + // no timestamp at EOF because the scores are on the last line + size_t eolOffset = currentOffset + (nextEol - buffer); + if (eolOffset + 1 == outputFileSize) { + WRITE_BUFFER(endOfBuffer, strlen(endOfBuffer)); // xxx remove + break; } - read_rem -= nread; - } + + // write the stuff up to the line break + WRITE_BUFFER(scanCursor, nextEol - scanCursor + 1); // write up to \n + // WRITE_BUFFER(aboutToInsert, strlen(aboutToInsert)); + timestampInserted = 1; + currentOffset += nextEol - scanCursor + 1; + + // write the timestamp + char stampInsert[200]; + sprintf(stampInsert, "...[timestamp inserted by autodriver: %s @ %lu]...\n", + getTimestamp(timestampMap[currentStamp].time), + timestampMap[currentStamp].offset); + WRITE_BUFFER(stampInsert, strlen(stampInsert)); + scanCursor = nextEol + 1; + + // skip the timestamps that lead up to the line break + while (currentStamp + 1 < timestampCount) { + currentStamp++; + + if (timestampMap[currentStamp].offset > eolOffset) { + break; + } + + char stampInsert[200]; + sprintf(stampInsert, "skip timestamp %s @ %lu\n", + getTimestamp(timestampMap[currentStamp].time), + 
timestampMap[currentStamp].offset); + WRITE_BUFFER(stampInsert, strlen(stampInsert)); + } + } // while loop through all stamps in read buffer + + if (currentOffset < nextOffset) { + WRITE_BUFFER(scanCursor, nextOffset - currentOffset); // write rest of buffer + WRITE_BUFFER(endOfBuffer, strlen(endOfBuffer)); + } + } // while loop finish reading return 0; } @@ -357,13 +473,12 @@ static void dump_output(void) { exit(EXIT_OSERROR); } - // xxx insert time stamps. - struct stat stat; if (fstat(outfd, &stat) < 0) { ERROR_ERRNO("Error stating output file"); exit(EXIT_OSERROR); } + outputFileSize = stat.st_size; // Truncate output if we have to if (args.osize > 0 && stat.st_size > args.osize) { @@ -372,12 +487,10 @@ static void dump_output(void) { exit(EXIT_OSERROR); } MESSAGE("\n...[excess bytes elided by autodriver]...\n"); + outputTruncated = 1; if (dump_file(outfd, part_size, stat.st_size - part_size) < 0) { exit(EXIT_OSERROR); } - - // xxx message indicating file has been truncated - } else { if (dump_file(outfd, stat.st_size, 0) < 0) { exit(EXIT_OSERROR); @@ -429,6 +542,8 @@ static void cleanup(void) { try++; } + exit(0); // remove xxx + // Delete all of the files owned by the user in ~user, /tmp, /var/tmp // We are currently in ~user. // (Note by @mpandya: the find binary is in /bin in RHEL but in /usr/bin @@ -441,47 +556,6 @@ static void cleanup(void) { } } -// pthread function, keep a map of timestamp and user's output file size -timestamp_map_t *timestampMap = NULL; -unsigned timestampCount = 0; -void *timestampFunc() { - sleep(10); // wait a bit for the output file to be created by child process - while (1) { - if (timestampCount % TIMESTAMP_MAP_CHUNK_SIZE == 0) { - timestamp_map_t *newBuffer = - realloc(timestampMap, - sizeof(timestamp_map_t) * (TIMESTAMP_MAP_CHUNK_SIZE + timestampCount)); - // printf("allocate %d items\n", (TIMESTAMP_MAP_CHUNK_SIZE + timestampCount)); - if (!newBuffer){ - ERROR_ERRNO("Failed to allocate timestamp map. 
Current map size %d", - timestampCount); - exit(EXIT_OSERROR); - } - timestampMap = newBuffer; - newBuffer += timestampCount; - memset(newBuffer, 0, sizeof(timestamp_map_t) * TIMESTAMP_MAP_CHUNK_SIZE); - } - - int outfd; - if ((outfd = open(OUTPUT_FILE, O_RDONLY)) < 0) { - ERROR_ERRNO("Error opening output file"); - exit(EXIT_OSERROR); - } - struct stat buf; - fstat(outfd, &buf); - timestampMap[timestampCount].time = time(NULL); - timestampMap[timestampCount].cursor = buf.st_size; - timestampCount++; - if (close(outfd) < 0) { - ERROR_ERRNO("Error closing output file"); - exit(EXIT_OSERROR); - } - sleep(args.timestamp_interval); - } - - return NULL; -} - /** * @brief Monitors the progression of the child * @@ -683,13 +757,6 @@ int main(int argc, char **argv) { tzset(); MESSAGE("Time zone %s:%s", tzname[0], tzname[1]); - /* - ERROR("test test"); - MESSAGE("%s %d interval %d", "abc", 123, args.timestamp_interval); - TIMESTAMP(); - exit(1); - */ - setup_dir(); // Block SIGCHLD to make sure monitor_child recieves it. From 3c0dd2f4ae9234a791d13b312d92624d224e2010 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 17 Jan 2018 15:38:54 -0500 Subject: [PATCH 061/131] checkpoint autodriver, after file creation being moved to parent process --- autodriver/autodriver.c | 97 +++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index ea80eaa2..c66736a5 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -114,9 +114,7 @@ unsigned timestampCount = 0; unsigned currentStamp = 0; size_t outputFileSize = 0; - -int outputTruncated = 0; -int timestampInserted = 0; +int output_fd; // OUTPUT_FILE created/opened by main process, used by child /** * @brief Parses a string into an unsigned integer. 
@@ -190,7 +188,6 @@ static int parse_user(char *name, struct passwd *user_info, char **buf) { // pthread function, keep a map of timestamp and user's output file size void *timestampFunc() { - sleep(10); // wait a bit for the output file to be created by child process while (1) { if (timestampCount % TIMESTAMP_MAP_CHUNK_SIZE == 0) { timestamp_map_t *newBuffer = @@ -218,7 +215,7 @@ void *timestampFunc() { timestampMap[timestampCount].offset = buf.st_size; timestampCount++; if (close(outfd) < 0) { - ERROR_ERRNO("Error closing output file"); + ERROR_ERRNO("Error closing output file by timestamp thread"); exit(EXIT_OSERROR); } sleep(args.timestamp_interval); @@ -227,7 +224,7 @@ void *timestampFunc() { return NULL; } -int writeBuffer(char *buffer, size_t nBytes) { +int writeBuffer(char *buffer, size_t nBytes) { // nBytes can be zero (no-op) ssize_t nwritten; size_t write_rem; char *write_base; @@ -258,13 +255,11 @@ int writeBuffer(char *buffer, size_t nBytes) { * @return 0 on success, -1 on failure */ static int dump_file(int fd, size_t bytes, off_t offset) { - char buffer[BUFSIZE]; size_t read_rem = bytes; - size_t currentOffset = offset; size_t nextOffset = offset; - char *endOfBuffer = "END OF BUFFER\n"; - char *aboutToInsert = "ABOUT TO INSERT\n"; + char *endOfBuffer = "\nEND OF BUFFER\n"; + char *aboutToInsert = "ABOUT TO INSERT\n"; // xxx remove // Flush stdout so our writes here don't race with buffer flushes if (fflush(stdout) != 0) { @@ -278,40 +273,48 @@ static int dump_file(int fd, size_t bytes, off_t offset) { } while (read_rem > 0) { + char buffer[BUFSIZE]; ssize_t nread; - char *scanCursor; + size_t bufferOffset = nextOffset; - currentOffset = nextOffset; memset(buffer, 0, BUFSIZE); - if ((nread = read(fd, buffer, min(read_rem, BUFSIZE))) < 0) { ERROR_ERRNO("Error reading from output file"); return -1; } read_rem -= nread; nextOffset += nread; // offset of currently read buffer in the file - scanCursor = buffer; + char *scanCursor = buffer; + size_t 
eolOffset = 0; // pace through timestamps that fall into the buffer while (currentStamp < timestampCount && timestampMap[currentStamp].offset < nextOffset) { - char *nextEol = strchr(scanCursor, '\n'); - if (!nextEol) { // no line break found in read buffer to insert timestamp - break; + + // there might be unused timestamps from last read buffer or before last eol + if (timestampMap[currentStamp].offset < bufferOffset || + timestampMap[currentStamp].offset <= eolOffset) { + currentStamp++; + continue; } - // no timestamp at EOF because the scores are on the last line - size_t eolOffset = currentOffset + (nextEol - buffer); - if (eolOffset + 1 == outputFileSize) { - WRITE_BUFFER(endOfBuffer, strlen(endOfBuffer)); // xxx remove + char *eolSearchStart = timestampMap[currentStamp].offset - bufferOffset + buffer; + char *nextEol = strchr(eolSearchStart, '\n'); + if (!nextEol) { // no line break found in read buffer to insert timestamp break; } // write the stuff up to the line break WRITE_BUFFER(scanCursor, nextEol - scanCursor + 1); // write up to \n // WRITE_BUFFER(aboutToInsert, strlen(aboutToInsert)); - timestampInserted = 1; - currentOffset += nextEol - scanCursor + 1; + scanCursor = nextEol + 1; + + // no timestamp at EOF, because the test scores are on the last line + eolOffset = bufferOffset + (nextEol - buffer); + if (eolOffset + 1 >= outputFileSize) { + WRITE_BUFFER(endOfBuffer, strlen(endOfBuffer)); // xxx remove + break; + } // write the timestamp char stampInsert[200]; @@ -319,28 +322,11 @@ static int dump_file(int fd, size_t bytes, off_t offset) { getTimestamp(timestampMap[currentStamp].time), timestampMap[currentStamp].offset); WRITE_BUFFER(stampInsert, strlen(stampInsert)); - scanCursor = nextEol + 1; - - // skip the timestamps that lead up to the line break - while (currentStamp + 1 < timestampCount) { - currentStamp++; + currentStamp++; + } // while loop through the stamps in read buffer - if (timestampMap[currentStamp].offset > eolOffset) { - break; 
- } - - char stampInsert[200]; - sprintf(stampInsert, "skip timestamp %s @ %lu\n", - getTimestamp(timestampMap[currentStamp].time), - timestampMap[currentStamp].offset); - WRITE_BUFFER(stampInsert, strlen(stampInsert)); - } - } // while loop through all stamps in read buffer - - if (currentOffset < nextOffset) { - WRITE_BUFFER(scanCursor, nextOffset - currentOffset); // write rest of buffer - WRITE_BUFFER(endOfBuffer, strlen(endOfBuffer)); - } + WRITE_BUFFER(scanCursor, nread - (scanCursor - buffer)); + // WRITE_BUFFER(endOfBuffer, strlen(endOfBuffer)); } // while loop finish reading return 0; @@ -482,12 +468,12 @@ static void dump_output(void) { // Truncate output if we have to if (args.osize > 0 && stat.st_size > args.osize) { + MESSAGE("Output too large -- will be elided in the middle"); unsigned part_size = args.osize / 2; if (dump_file(outfd, part_size, 0) < 0) { exit(EXIT_OSERROR); } MESSAGE("\n...[excess bytes elided by autodriver]...\n"); - outputTruncated = 1; if (dump_file(outfd, part_size, stat.st_size - part_size) < 0) { exit(EXIT_OSERROR); } @@ -497,7 +483,7 @@ static void dump_output(void) { } } if (close(outfd) < 0) { - ERROR_ERRNO("Error closing output file"); + ERROR_ERRNO("Error closing output file by parent process"); exit(EXIT_OSERROR); } } @@ -666,12 +652,7 @@ static void run_job(void) { } // Redirect output - int fd; - if ((fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, // no buffering - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { - ERROR_ERRNO("Error creating output file"); - exit(EXIT_OSERROR); - } + int fd = output_fd; if (dup2(fd, STDOUT_FILENO) < 0) { ERROR_ERRNO("Error redirecting standard output"); @@ -684,7 +665,7 @@ static void run_job(void) { } if (close(fd) < 0) { - ERROR_ERRNO("Error closing output file"); + ERROR_ERRNO("Error closing output file by child process"); exit(EXIT_OSERROR); } @@ -765,6 +746,12 @@ int main(int argc, char **argv) { sigaddset(&sigset, SIGCHLD); sigprocmask(SIG_BLOCK, &sigset, NULL); + if 
((output_fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { + ERROR_ERRNO("Error creating output file"); + exit(EXIT_OSERROR); + } + pid_t pid = fork(); if (pid < 0) { ERROR_ERRNO("Unable to fork"); @@ -772,6 +759,10 @@ int main(int argc, char **argv) { } else if (pid == 0) { run_job(); } else { + if (close(output_fd) < 0) { + ERROR_ERRNO("Error closing output file by main process"); + exit(EXIT_OSERROR); + } monitor_child(pid); } From 0cefe52ded36fd0ca15e8dafecdd8b038bcf7b67 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 23 Jan 2018 17:27:27 -0500 Subject: [PATCH 062/131] Cleanup in autodriver --- autodriver/autodriver.c | 49 +++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index c66736a5..77585014 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -82,8 +82,6 @@ char * getTimestamp(time_t t) { /* Number of seconds to wait in between pkills */ #define SHUTDOWN_GRACE_TIME 3 -error_t argp_err_exit_status = EXIT_USAGE; - /** * @brief A structure containing all of the user-configurable settings */ @@ -106,12 +104,10 @@ typedef struct { size_t offset; } timestamp_map_t; -// #define TIMESTAMP_MAP_CHUNK_SIZE 1024 xxx put it back -#define TIMESTAMP_MAP_CHUNK_SIZE 10 +#define TIMESTAMP_MAP_CHUNK_SIZE 1024 -timestamp_map_t *timestampMap = NULL; +timestamp_map_t *timestampMap = NULL; // remember time@offset of output file unsigned timestampCount = 0; -unsigned currentStamp = 0; size_t outputFileSize = 0; int output_fd; // OUTPUT_FILE created/opened by main process, used by child @@ -186,14 +182,14 @@ static int parse_user(char *name, struct passwd *user_info, char **buf) { return 0; } -// pthread function, keep a map of timestamp and user's output file size +// pthread function, keep a map of timestamp and user's output file offset void *timestampFunc() { while (1) { + // 
allocate/reallocate space to create/grow the map if (timestampCount % TIMESTAMP_MAP_CHUNK_SIZE == 0) { timestamp_map_t *newBuffer = realloc(timestampMap, sizeof(timestamp_map_t) * (TIMESTAMP_MAP_CHUNK_SIZE + timestampCount)); - // printf("allocate %d items\n", (TIMESTAMP_MAP_CHUNK_SIZE + timestampCount)); if (!newBuffer){ ERROR_ERRNO("Failed to allocate timestamp map. Current map size %d", timestampCount); @@ -218,6 +214,7 @@ void *timestampFunc() { ERROR_ERRNO("Error closing output file by timestamp thread"); exit(EXIT_OSERROR); } + sleep(args.timestamp_interval); } @@ -225,12 +222,10 @@ void *timestampFunc() { } int writeBuffer(char *buffer, size_t nBytes) { // nBytes can be zero (no-op) - ssize_t nwritten; - size_t write_rem; - char *write_base; + ssize_t nwritten = 0; + size_t write_rem = nBytes; + char *write_base = buffer; - write_rem = nBytes; - write_base = buffer; while (write_rem > 0) { if ((nwritten = write(STDOUT_FILENO, write_base, write_rem)) < 0) { ERROR_ERRNO("Error writing output"); @@ -255,11 +250,14 @@ int writeBuffer(char *buffer, size_t nBytes) { // nBytes can be zero (no-op) * @return 0 on success, -1 on failure */ static int dump_file(int fd, size_t bytes, off_t offset) { + static unsigned currentStamp = 0; size_t read_rem = bytes; size_t nextOffset = offset; - char *endOfBuffer = "\nEND OF BUFFER\n"; - char *aboutToInsert = "ABOUT TO INSERT\n"; // xxx remove + if (offset) { // second part of output file, after truncating in the middle + char *msg = "\n...[excess bytes elided by autodriver]...\n"; + WRITE_BUFFER(msg, strlen(msg)); + } // Flush stdout so our writes here don't race with buffer flushes if (fflush(stdout) != 0) { @@ -306,27 +304,25 @@ static int dump_file(int fd, size_t bytes, off_t offset) { // write the stuff up to the line break WRITE_BUFFER(scanCursor, nextEol - scanCursor + 1); // write up to \n - // WRITE_BUFFER(aboutToInsert, strlen(aboutToInsert)); scanCursor = nextEol + 1; // no timestamp at EOF, because the test scores 
are on the last line eolOffset = bufferOffset + (nextEol - buffer); if (eolOffset + 1 >= outputFileSize) { - WRITE_BUFFER(endOfBuffer, strlen(endOfBuffer)); // xxx remove break; } // write the timestamp char stampInsert[200]; - sprintf(stampInsert, "...[timestamp inserted by autodriver: %s @ %lu]...\n", + sprintf(stampInsert, + "...[timestamp %s inserted by autodriver at offset ~%lu. Maybe out of sync with output's own timestamps.]...\n", getTimestamp(timestampMap[currentStamp].time), timestampMap[currentStamp].offset); WRITE_BUFFER(stampInsert, strlen(stampInsert)); currentStamp++; - } // while loop through the stamps in read buffer + } // while loop through the stamps falling into read buffer's range WRITE_BUFFER(scanCursor, nread - (scanCursor - buffer)); - // WRITE_BUFFER(endOfBuffer, strlen(endOfBuffer)); } // while loop finish reading return 0; @@ -468,12 +464,12 @@ static void dump_output(void) { // Truncate output if we have to if (args.osize > 0 && stat.st_size > args.osize) { - MESSAGE("Output too large -- will be elided in the middle"); + MESSAGE("Output size %lu > limit %u -- will elide in the middle", + stat.st_size, args.osize); unsigned part_size = args.osize / 2; if (dump_file(outfd, part_size, 0) < 0) { exit(EXIT_OSERROR); } - MESSAGE("\n...[excess bytes elided by autodriver]...\n"); if (dump_file(outfd, part_size, stat.st_size - part_size) < 0) { exit(EXIT_OSERROR); } @@ -528,8 +524,6 @@ static void cleanup(void) { try++; } - exit(0); // remove xxx - // Delete all of the files owned by the user in ~user, /tmp, /var/tmp // We are currently in ~user. // (Note by @mpandya: the find binary is in /bin in RHEL but in /usr/bin @@ -595,6 +589,10 @@ static int monitor_child(pid_t child) { MESSAGE("Job exited with status %d", WEXITSTATUS(status)); } + if (args.timestamp_interval > 0) { + MESSAGE("Timestamps inserted at %d-second or larger intervals, depending output rates", + args.timestamp_interval); + } dump_output(); cleanup(); exit(killed ? 
EXIT_TIMEOUT : EXIT_SUCCESS); @@ -687,7 +685,7 @@ int main(int argc, char **argv) { args.fsize = 0; args.timeout = 0; args.osize = 0; - args.timestamp_interval = 1; // xxx change back to 30 + args.timestamp_interval = 0; args.timezone = NULL; startTime = time(NULL); @@ -768,4 +766,3 @@ int main(int argc, char **argv) { return 0; } - From 6f429effaab02784c5de073466d2da4e82900776 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 24 Jan 2018 14:52:28 -0500 Subject: [PATCH 063/131] audodriver should not exit for minor errors --- autodriver/autodriver.c | 67 +++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 77585014..9fa029e7 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -102,6 +102,7 @@ unsigned long startTime = 0; typedef struct { time_t time; size_t offset; + int offsetJumped; } timestamp_map_t; #define TIMESTAMP_MAP_CHUNK_SIZE 1024 @@ -110,7 +111,8 @@ timestamp_map_t *timestampMap = NULL; // remember time@offset of output file unsigned timestampCount = 0; size_t outputFileSize = 0; -int output_fd; // OUTPUT_FILE created/opened by main process, used by child +int child_output_fd; // OUTPUT_FILE created/opened by main process, used by child +int parent_output_fd; // OUTPUT_FILE created/opened by main process, used by parent /** * @brief Parses a string into an unsigned integer. @@ -184,6 +186,7 @@ static int parse_user(char *name, struct passwd *user_info, char **buf) { // pthread function, keep a map of timestamp and user's output file offset void *timestampFunc() { + // time_t lastIntervalStamp = time(NULL); while (1) { // allocate/reallocate space to create/grow the map if (timestampCount % TIMESTAMP_MAP_CHUNK_SIZE == 0) { @@ -193,27 +196,36 @@ void *timestampFunc() { if (!newBuffer){ ERROR_ERRNO("Failed to allocate timestamp map. 
Current map size %d", timestampCount); - exit(EXIT_OSERROR); + continue; // continue without allocation } timestampMap = newBuffer; newBuffer += timestampCount; memset(newBuffer, 0, sizeof(timestamp_map_t) * TIMESTAMP_MAP_CHUNK_SIZE); } - int outfd; - if ((outfd = open(OUTPUT_FILE, O_RDONLY)) < 0) { - ERROR_ERRNO("Error opening output file"); - exit(EXIT_OSERROR); - } struct stat buf; - fstat(outfd, &buf); + if (parent_output_fd <= 0 || fstat(parent_output_fd, &buf) < 0) { + ERROR_ERRNO("Error statting output file to read offset"); + continue; // simply skip this time + } + + /* + time_t currentTime = time(NULL); + int addStamp = 0; + if (currentTime - lastIntervalStamp >= args.timestamp_interval) { + addStamp = 1; + lastIntervalStamp = currentTime; + } + */ + timestampMap[timestampCount].time = time(NULL); timestampMap[timestampCount].offset = buf.st_size; timestampCount++; - if (close(outfd) < 0) { - ERROR_ERRNO("Error closing output file by timestamp thread"); - exit(EXIT_OSERROR); - } + + /* + printf("current time %lu\n", time(NULL)); + sleep(1); + */ sleep(args.timestamp_interval); } @@ -451,13 +463,13 @@ static void setup_dir(void) { static void dump_output(void) { int outfd; if ((outfd = open(OUTPUT_FILE, O_RDONLY)) < 0) { - ERROR_ERRNO("Error opening output file"); + ERROR_ERRNO("Error opening output file at the end of test"); exit(EXIT_OSERROR); } struct stat stat; if (fstat(outfd, &stat) < 0) { - ERROR_ERRNO("Error stating output file"); + ERROR_ERRNO("Error statting output file"); exit(EXIT_OSERROR); } outputFileSize = stat.st_size; @@ -479,7 +491,7 @@ static void dump_output(void) { } } if (close(outfd) < 0) { - ERROR_ERRNO("Error closing output file by parent process"); + ERROR_ERRNO("Error closing output file at the end of test"); exit(EXIT_OSERROR); } } @@ -498,7 +510,7 @@ static int kill_processes(char *sig) { if ((ret = call_program("/usr/bin/pkill", pkill_args)) > 1) { ERROR("Error killing user processes"); - exit(EXIT_OSERROR); + // don't quit. 
Let the caller decide } return ret; } @@ -509,6 +521,10 @@ static int kill_processes(char *sig) { * Kills all processes and deletes all files */ static void cleanup(void) { + if (parent_output_fd <= 0 || close(parent_output_fd) < 0) { + ERROR_ERRNO("Error closing output file before cleanup"); + } + // Kill all of the user's processes int ret; int try = 0; @@ -518,7 +534,7 @@ static void cleanup(void) { sleep(SHUTDOWN_GRACE_TIME); if (try > MAX_KILL_ATTEMPTS) { ERROR("Gave up killing user processes"); - exit(EXIT_OSERROR); + break; // continue to cleanup with best effort } ret = kill_processes("-KILL"); try++; @@ -650,7 +666,7 @@ static void run_job(void) { } // Redirect output - int fd = output_fd; + int fd = child_output_fd; if (dup2(fd, STDOUT_FILENO) < 0) { ERROR_ERRNO("Error redirecting standard output"); @@ -744,7 +760,7 @@ int main(int argc, char **argv) { sigaddset(&sigset, SIGCHLD); sigprocmask(SIG_BLOCK, &sigset, NULL); - if ((output_fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, + if ((child_output_fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { ERROR_ERRNO("Error creating output file"); exit(EXIT_OSERROR); @@ -757,10 +773,17 @@ int main(int argc, char **argv) { } else if (pid == 0) { run_job(); } else { - if (close(output_fd) < 0) { - ERROR_ERRNO("Error closing output file by main process"); - exit(EXIT_OSERROR); + if (close(child_output_fd) < 0) { + ERROR_ERRNO("Error closing output file by parent process"); + // don't quit for this type of error } + + // open output file read only to build timestamp:offset map + if ((parent_output_fd = open(OUTPUT_FILE, O_RDONLY)) < 0) { + ERROR_ERRNO("Error opening output file by parent process"); + // don't quit for this type of error + } + monitor_child(pid); } From 95f85743a7d3c8ff2e03980206dca4e9d2b64b6e Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 26 Jan 2018 15:07:04 -0500 Subject: [PATCH 064/131] Ready to be made into 
an image. --- autodriver/autodriver.c | 48 +++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 9fa029e7..8dda211e 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -102,7 +102,6 @@ unsigned long startTime = 0; typedef struct { time_t time; size_t offset; - int offsetJumped; } timestamp_map_t; #define TIMESTAMP_MAP_CHUNK_SIZE 1024 @@ -186,8 +185,12 @@ static int parse_user(char *name, struct passwd *user_info, char **buf) { // pthread function, keep a map of timestamp and user's output file offset void *timestampFunc() { - // time_t lastIntervalStamp = time(NULL); + time_t lastStamp = 0; + int lastJumpIndex = -1; + while (1) { + sleep(1); + // allocate/reallocate space to create/grow the map if (timestampCount % TIMESTAMP_MAP_CHUNK_SIZE == 0) { timestamp_map_t *newBuffer = @@ -209,25 +212,28 @@ void *timestampFunc() { continue; // simply skip this time } - /* + size_t currentOffset = buf.st_size; time_t currentTime = time(NULL); - int addStamp = 0; - if (currentTime - lastIntervalStamp >= args.timestamp_interval) { - addStamp = 1; - lastIntervalStamp = currentTime; - } - */ - timestampMap[timestampCount].time = time(NULL); - timestampMap[timestampCount].offset = buf.st_size; - timestampCount++; + // record following timestamps: + // 1. enough time has passed since last timestamp or + // 2. 
output has grown and enough time has passed since last offset change - /* - printf("current time %lu\n", time(NULL)); - sleep(1); - */ + if (timestampCount == 0 || + timestampMap[timestampCount - 1].offset != currentOffset) { + if (lastJumpIndex >= 0 && + currentTime - timestampMap[lastJumpIndex].time < args.timestamp_interval) { + continue; + } + lastJumpIndex = timestampCount; + } else if (currentTime - lastStamp < args.timestamp_interval) { + continue; + } - sleep(args.timestamp_interval); + lastStamp = currentTime; + timestampMap[timestampCount].time = currentTime; + timestampMap[timestampCount].offset = currentOffset; + timestampCount++; } return NULL; @@ -568,7 +574,7 @@ static int monitor_child(pid_t child) { // create a thread for for file size tracking by time interval pthread_t timestampThread = 0; // this thread needs no cancellation - if (args.timestamp_interval) { + if (args.timestamp_interval > 0) { if (pthread_create(×tampThread, NULL, timestampFunc, NULL)) { ERROR_ERRNO("Failed to create timestamp thread"); exit(EXIT_OSERROR); @@ -599,14 +605,14 @@ static int monitor_child(pid_t child) { exit(EXIT_OSERROR); } - MESSAGE("Duration of test is %lu seconds", time(NULL) - startTime); + MESSAGE("Test terminates. Duration: %lu seconds", time(NULL) - startTime); if (!killed) { MESSAGE("Job exited with status %d", WEXITSTATUS(status)); } if (args.timestamp_interval > 0) { - MESSAGE("Timestamps inserted at %d-second or larger intervals, depending output rates", + MESSAGE("Timestamps inserted at %d-second or larger intervals, depending on output rates", args.timestamp_interval); } dump_output(); @@ -750,7 +756,7 @@ int main(int argc, char **argv) { putenv(tz); } tzset(); - MESSAGE("Time zone %s:%s", tzname[0], tzname[1]); + MESSAGE("Test Starts. 
Time zone %s:%s", tzname[0], tzname[1]); setup_dir(); From dc22fd5abdfbf60d289672f9164c1f5d26b33f2c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 26 Jan 2018 16:44:21 -0500 Subject: [PATCH 065/131] Add an output generator to help test autodriver. --- autodriver/test/Makefile | 20 +++++++++++++++++--- autodriver/test/README | 8 ++++++++ autodriver/test/output_gen.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 autodriver/test/README create mode 100644 autodriver/test/output_gen.c diff --git a/autodriver/test/Makefile b/autodriver/test/Makefile index c5859d96..41745a35 100644 --- a/autodriver/test/Makefile +++ b/autodriver/test/Makefile @@ -1,3 +1,17 @@ -autograde: - id - sleep 5 +CC = gcc +CFLAGS = -W -Wall -Wextra + +OBJS = output_gen.o + +all: output_gen run_output_gen + +output_gen: $(OBJS) + $(CC) $(LDFLAGS) -o output_gen $(OBJS) + +clean: + rm -f *.o output_gen + +.PHONY: clean + +run_output_gen: + output_gen diff --git a/autodriver/test/README b/autodriver/test/README new file mode 100644 index 00000000..790e37fd --- /dev/null +++ b/autodriver/test/README @@ -0,0 +1,8 @@ +How to test autodriver: First create a user "autograde" on the test machine. 
+ +cd Tango/autodriver +make +cp -r test tmp +autodriver tmp + + diff --git a/autodriver/test/output_gen.c b/autodriver/test/output_gen.c new file mode 100644 index 00000000..19e3e98f --- /dev/null +++ b/autodriver/test/output_gen.c @@ -0,0 +1,30 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include + +int main() { + putenv("TZ=America/New_York"); + tzset(); + + int i, k; + char timeStr[100]; + for (k = 0; k < 100; k++) { + for (i = 0; i < 200; i++) { + time_t ltime = time(NULL); + struct tm* tmInfo = localtime(<ime); + strftime(timeStr, 100, "%Y%m%d-%H:%M:%S", tmInfo); + printf("TIME: \"%s\"\n", timeStr); + int j; + for (j = 0; j < 10; j++) { + printf("=%1d-0123456789", j); + } + printf("\n"); + } + sleep(1); + } + sleep(5); + exit(0); +} From 4b1439a77e02d1d9dc0262c1b563f2ff4995be58 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 26 Jan 2018 16:46:37 -0500 Subject: [PATCH 066/131] Add pthread lib into build. --- autodriver/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/autodriver/Makefile b/autodriver/Makefile index 203ca231..9fde9293 100644 --- a/autodriver/Makefile +++ b/autodriver/Makefile @@ -1,5 +1,6 @@ CC = gcc CFLAGS = -W -Wall -Wextra +LDFLAGS = -pthread OBJS = autodriver.o From bd174295c259757e733359028713daa385510ba0 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 26 Jan 2018 16:48:13 -0500 Subject: [PATCH 067/131] remove old experiment file. 
--- autodriver/try.c | 183 ----------------------------------------------- 1 file changed, 183 deletions(-) delete mode 100644 autodriver/try.c diff --git a/autodriver/try.c b/autodriver/try.c deleted file mode 100644 index 7a573180..00000000 --- a/autodriver/try.c +++ /dev/null @@ -1,183 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int filedes[2]; - -void childFunc() { - int i; - char j; - char buffer[10000]; - for (i = 0; i < 1000; i++) { - for (j = '0'; j <= '9'; j++) { - buffer[ i * 10 + (j - '0') ] = j; - } - } - buffer[9999] = '\0'; - printf("%s", buffer); - - /* - for (i = 0; i < 5; i++) { - system("date"); - fprintf(stdout, "stdout in child\n"); - int eol = rand() % 2; // boolean - if (eol) { - fprintf(stderr, "stderr with eol in child\n"); - } else { - fprintf(stderr, "stderr without eol in child"); - } - //sleep(3); - } - */ - - _exit(1); -} - -void* readFunc() { - char buffer[70]; - - int timestampAfterNextRead = 1; // boolean - while (1) { - memset(&buffer[0], 0, sizeof(buffer)); - ssize_t count = read(filedes[0], buffer, sizeof(buffer) - 1); - - - if (count == -1) { - if (errno == EINTR) { - continue; - } else { - perror("read"); - exit(1); - } - } else if (count == 0) { - fprintf(stderr, "exit \n"); - break; - } else { - // int insertNull = rand() % 2; // boolean - // int insertIndex = rand() % count; - int processedCount = 0; - int addTimestamp = timestampAfterNextRead; - - fprintf(stderr, "\n====================================\n"); - fprintf(stderr, "### read %lu bytes: \"%s\"\n", count, buffer); - timestampAfterNextRead = (buffer[count - 1] == '\n'); // boolean - - /* - fprintf(stderr, "### random insert index %d %d, \"%s\", \"%s\"\n", - insertNull, insertIndex, buffer, &buffer[insertIndex]); - if (insertNull) { - buffer[insertIndex] = '\0'; - } - */ - - char *result = strtok(buffer, "\n"); - 
while (1) { - if (!result) { - if (processedCount < count) { // must have seen a NULL - if (!buffer[processedCount]) { - processedCount++; - } - addTimestamp = 1; // null is dealt like \n - result = strtok(&buffer[processedCount], "\n"); - fprintf(stderr, "### processed after null \"%s\"\n", &buffer[processedCount]); - continue; - } - break; - } - - fprintf(stderr, "### result \"%s\" %lu, \"%s\"\n", result, strlen(result), - &buffer[processedCount]); - processedCount += strlen(result) + 1; - char *eol = (processedCount >= count && !timestampAfterNextRead) ? "" : "\n"; - fprintf(stderr, (eol[0] == '\n') ? "Add eol\n" : "Not add eol\n"); - - assert(processedCount <= count + 1); - - time_t ltime = time(NULL); - struct tm* tmInfo = localtime(<ime); - char timeStr[100]; - if (addTimestamp) { - strftime(timeStr, 100, "%Y%m%d-%H:%M:%S", tmInfo); - printf("%s: \"%s\"%s", timeStr, result, eol); - } else { - printf("\"%s\"%s", result, eol); - } - - addTimestamp = 1; - result = strtok(NULL, "\n"); - } - - addTimestamp = (buffer[count - 1] == '\n'); // boolean - } - } - return NULL; -} - -int main() { - putenv("TZ=America/New_York"); - tzset(); - - if (pipe(filedes) == -1) { - perror("pipe"); - exit(1); - } - - pid_t pid = fork(); - if (pid == -1) { - perror("fork"); - exit(1); - } else if (pid == 0) { - setvbuf(stdout, NULL, _IONBF, 0); - while ((dup2(filedes[1], STDOUT_FILENO) == -1) && (errno == EINTR)) {} - while ((dup2(filedes[1], STDERR_FILENO) == -1) && (errno == EINTR)) {} - close(filedes[1]); - close(filedes[0]); - childFunc(); - } - - // parent process comes here - pthread_t readThread; - - /* create a second thread which executes inc_x(&x) */ - if(pthread_create(&readThread, NULL, readFunc, NULL)) { - perror("create thread"); - exit(1); - } - - /* - result = waitpid(pid, &status, WNOHANG); - if (result == 0) { - printf("child done\n"); - break; - } else if (result < 0) { - perror("waitpid"); - exit(1); - } - printf("wait pid %d\n", result); - */ - - { - int status; 
- wait(&status); - - sleep(10); - - status = pthread_cancel(readThread); - close(filedes[0]); - } -} From daaa912ce53938e4e6e3baa8f88f1c06ed2d9517 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 30 Jan 2018 13:48:53 -0500 Subject: [PATCH 068/131] Add autodriver timestamp interval in tango config. --- config.template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.template.py b/config.template.py index 84124bce..5f72fe1c 100644 --- a/config.template.py +++ b/config.template.py @@ -70,7 +70,7 @@ class Config: COPYOUT_TIMEOUT = 30 # time zone and timestamp report interval for autodriver execution - AUTODRIVER_LOGGING_TIME_ZONE = "UTC" # e.g. "America/New_York". NULL => UTC + AUTODRIVER_LOGGING_TIME_ZONE = "UTC" # e.g. "America/New_York". AUTODRIVER_TIMESTAMP_INTERVAL = 0 # in seconds. 0 => no timestamp insersion # Docker constants From 9812622458d9ad4536a8b3e35c24d4c8416fe64a Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 1 Feb 2018 15:48:07 -0500 Subject: [PATCH 069/131] Remove redundant wording in error messages. 
--- autodriver/autodriver.c | 56 ++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 8dda211e..92863273 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -208,7 +208,7 @@ void *timestampFunc() { struct stat buf; if (parent_output_fd <= 0 || fstat(parent_output_fd, &buf) < 0) { - ERROR_ERRNO("Error statting output file to read offset"); + ERROR_ERRNO("Statting output file to read offset"); continue; // simply skip this time } @@ -246,7 +246,7 @@ int writeBuffer(char *buffer, size_t nBytes) { // nBytes can be zero (no-op) while (write_rem > 0) { if ((nwritten = write(STDOUT_FILENO, write_base, write_rem)) < 0) { - ERROR_ERRNO("Error writing output"); + ERROR_ERRNO("Writing output"); return -1; } write_rem -= nwritten; @@ -279,12 +279,12 @@ static int dump_file(int fd, size_t bytes, off_t offset) { // Flush stdout so our writes here don't race with buffer flushes if (fflush(stdout) != 0) { - ERROR_ERRNO("Error flushing standard out"); + ERROR_ERRNO("Flushing standard out"); return -1; } if (lseek(fd, offset, SEEK_SET) < 0) { - ERROR_ERRNO("Error seeking in output file"); + ERROR_ERRNO("Seeking in output file"); return -1; } @@ -295,7 +295,7 @@ static int dump_file(int fd, size_t bytes, off_t offset) { memset(buffer, 0, BUFSIZE); if ((nread = read(fd, buffer, min(read_rem, BUFSIZE))) < 0) { - ERROR_ERRNO("Error reading from output file"); + ERROR_ERRNO("Reading from output file"); return -1; } read_rem -= nread; @@ -443,13 +443,13 @@ static void setup_dir(void) { char *mv_args[] = {"/bin/mv", "-f", args.directory, args.user_info.pw_dir, NULL}; if (call_program("/bin/mv", mv_args) != 0) { - ERROR("Error moving directory"); + ERROR("Moving directory"); exit(EXIT_OSERROR); } // And switch over to that directory if (chdir(args.user_info.pw_dir) < 0) { - ERROR_ERRNO("Error changing directories"); + ERROR_ERRNO("Changing directories"); 
exit(EXIT_OSERROR); } @@ -458,7 +458,7 @@ static void setup_dir(void) { sprintf(owner, "%d:%d", args.user_info.pw_uid, args.user_info.pw_gid); char *chown_args[] = {"/bin/chown", "-R", owner, args.directory, NULL}; if (call_program("/bin/chown", chown_args) != 0) { - ERROR("Error chowining directory"); + ERROR("Chowining directory"); exit(EXIT_OSERROR); } } @@ -469,13 +469,13 @@ static void setup_dir(void) { static void dump_output(void) { int outfd; if ((outfd = open(OUTPUT_FILE, O_RDONLY)) < 0) { - ERROR_ERRNO("Error opening output file at the end of test"); + ERROR_ERRNO("Opening output file at the end of test"); exit(EXIT_OSERROR); } struct stat stat; if (fstat(outfd, &stat) < 0) { - ERROR_ERRNO("Error statting output file"); + ERROR_ERRNO("Statting output file"); exit(EXIT_OSERROR); } outputFileSize = stat.st_size; @@ -497,7 +497,7 @@ static void dump_output(void) { } } if (close(outfd) < 0) { - ERROR_ERRNO("Error closing output file at the end of test"); + ERROR_ERRNO("Closing output file at the end of test"); exit(EXIT_OSERROR); } } @@ -515,7 +515,7 @@ static int kill_processes(char *sig) { GRADING_USER, NULL}; if ((ret = call_program("/usr/bin/pkill", pkill_args)) > 1) { - ERROR("Error killing user processes"); + ERROR("Killing user processes"); // don't quit. 
Let the caller decide } return ret; @@ -528,7 +528,7 @@ static int kill_processes(char *sig) { */ static void cleanup(void) { if (parent_output_fd <= 0 || close(parent_output_fd) < 0) { - ERROR_ERRNO("Error closing output file before cleanup"); + ERROR_ERRNO("Closing output file before cleanup"); } // Kill all of the user's processes @@ -553,7 +553,7 @@ static void cleanup(void) { char *find_args[] = {"find", "/usr/bin/find", ".", "/tmp", "/var/tmp", "-user", args.user_info.pw_name, "-delete", NULL}; if (call_program("/usr/bin/env", find_args) != 0) { - ERROR("Error deleting user's files"); + ERROR("Deleting user's files"); exit(EXIT_OSERROR); } } @@ -601,7 +601,7 @@ static int monitor_child(pid_t child) { } if (waitpid(child, &status, 0) < 0) { - ERROR_ERRNO("Error reaping child"); + ERROR_ERRNO("Reaping child"); exit(EXIT_OSERROR); } @@ -640,7 +640,7 @@ static void run_job(void) { if (args.nproc != 0) { struct rlimit rlimit = {args.nproc, args.nproc}; if (setrlimit(RLIMIT_NPROC, &rlimit) < 0) { - perror("Error setting process limit"); + perror("Setting process limit"); exit(EXIT_OSERROR); } } @@ -648,26 +648,26 @@ static void run_job(void) { if (args.fsize != 0) { struct rlimit rlimit = {args.fsize, args.fsize}; if (setrlimit(RLIMIT_FSIZE, &rlimit) < 0) { - ERROR_ERRNO("Error setting filesize limit"); + ERROR_ERRNO("Setting filesize limit"); exit(EXIT_OSERROR); } } // Drop permissions if (initgroups(args.user_info.pw_name, args.user_info.pw_gid) < 0) { - ERROR_ERRNO("Error setting supplementary group IDs"); + ERROR_ERRNO("Setting supplementary group IDs"); exit(EXIT_OSERROR); } if (setresgid(args.user_info.pw_gid, args.user_info.pw_gid, args.user_info.pw_gid) < 0) { - ERROR_ERRNO("Error setting group ID"); + ERROR_ERRNO("Setting group ID"); exit(EXIT_OSERROR); } if (setresuid(args.user_info.pw_uid, args.user_info.pw_uid, args.user_info.pw_uid) < 0) { - ERROR_ERRNO("Error setting user ID"); + ERROR_ERRNO("Setting user ID"); exit(EXIT_OSERROR); } @@ -675,29 +675,29 
@@ static void run_job(void) { int fd = child_output_fd; if (dup2(fd, STDOUT_FILENO) < 0) { - ERROR_ERRNO("Error redirecting standard output"); + ERROR_ERRNO("Redirecting standard output"); exit(EXIT_OSERROR); } if (dup2(fd, STDERR_FILENO) < 0) { - ERROR_ERRNO("Error redirecting standard error"); + ERROR_ERRNO("Redirecting standard error"); exit(EXIT_OSERROR); } if (close(fd) < 0) { - ERROR_ERRNO("Error closing output file by child process"); + ERROR_ERRNO("Closing output file by child process"); exit(EXIT_OSERROR); } // Switch into the folder if (chdir(args.directory) < 0) { - ERROR_ERRNO("Error changing directory"); + ERROR_ERRNO("Changing directory"); exit(EXIT_OSERROR); } // Finally exec job execl("/usr/bin/make", "make", NULL); - ERROR_ERRNO("Error executing make"); + ERROR_ERRNO("Eexecuting make"); exit(EXIT_OSERROR); } @@ -768,7 +768,7 @@ int main(int argc, char **argv) { if ((child_output_fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { - ERROR_ERRNO("Error creating output file"); + ERROR_ERRNO("Creating output file"); exit(EXIT_OSERROR); } @@ -780,13 +780,13 @@ int main(int argc, char **argv) { run_job(); } else { if (close(child_output_fd) < 0) { - ERROR_ERRNO("Error closing output file by parent process"); + ERROR_ERRNO("Closing output file by parent process"); // don't quit for this type of error } // open output file read only to build timestamp:offset map if ((parent_output_fd = open(OUTPUT_FILE, O_RDONLY)) < 0) { - ERROR_ERRNO("Error opening output file by parent process"); + ERROR_ERRNO("Opening output file by parent process"); // don't quit for this type of error } From 7351917d2c338a185ac1ee12e828b3b26099ad11 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 2 Feb 2018 16:35:34 -0500 Subject: [PATCH 070/131] chown output file to autograde. notify child thread job finish. 
--- autodriver/autodriver.c | 46 +++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 92863273..75d26e33 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -60,6 +60,9 @@ char * getTimestamp(time_t t) { #define MESSAGE(format, ...) \ printf("Autodriver@%s: " format "\n", getTimestamp(0), ##__VA_ARGS__) +#define NL_MESSAGE(format, ...) \ + printf("\nAutodriver@%s: " format "\n", getTimestamp(0), ##__VA_ARGS__) + #define EXIT__BASE 1 /* Exit codes for use after errors */ @@ -98,6 +101,7 @@ struct arguments { } args; unsigned long startTime = 0; +int childTimedOut = 0; typedef struct { time_t time; @@ -108,10 +112,10 @@ typedef struct { timestamp_map_t *timestampMap = NULL; // remember time@offset of output file unsigned timestampCount = 0; +int childFinished = 0; size_t outputFileSize = 0; int child_output_fd; // OUTPUT_FILE created/opened by main process, used by child -int parent_output_fd; // OUTPUT_FILE created/opened by main process, used by parent /** * @brief Parses a string into an unsigned integer. 
@@ -187,8 +191,19 @@ static int parse_user(char *name, struct passwd *user_info, char **buf) { void *timestampFunc() { time_t lastStamp = 0; int lastJumpIndex = -1; + int output_fd; + + // open output file read only to build timestamp:offset map + if ((output_fd = open(OUTPUT_FILE, O_RDONLY)) < 0) { + ERROR_ERRNO("Opening output file by parent process"); + // don't quit for this type of error + } while (1) { + if (childFinished) { + break; + } + sleep(1); // allocate/reallocate space to create/grow the map @@ -207,7 +222,7 @@ void *timestampFunc() { } struct stat buf; - if (parent_output_fd <= 0 || fstat(parent_output_fd, &buf) < 0) { + if (output_fd <= 0 || fstat(output_fd, &buf) < 0) { ERROR_ERRNO("Statting output file to read offset"); continue; // simply skip this time } @@ -236,6 +251,9 @@ void *timestampFunc() { timestampCount++; } + if (output_fd <= 0 || close(output_fd) < 0) { + ERROR_ERRNO("Closing output file before cleanup"); + } return NULL; } @@ -458,7 +476,7 @@ static void setup_dir(void) { sprintf(owner, "%d:%d", args.user_info.pw_uid, args.user_info.pw_gid); char *chown_args[] = {"/bin/chown", "-R", owner, args.directory, NULL}; if (call_program("/bin/chown", chown_args) != 0) { - ERROR("Chowining directory"); + ERROR("Chowning directory"); exit(EXIT_OSERROR); } } @@ -527,10 +545,6 @@ static int kill_processes(char *sig) { * Kills all processes and deletes all files */ static void cleanup(void) { - if (parent_output_fd <= 0 || close(parent_output_fd) < 0) { - ERROR_ERRNO("Closing output file before cleanup"); - } - // Kill all of the user's processes int ret; int try = 0; @@ -597,6 +611,7 @@ static int monitor_child(pid_t child) { assert(errno == EAGAIN); kill(child, SIGKILL); killed = 1; + childTimedOut = 1; } } @@ -615,7 +630,13 @@ static int monitor_child(pid_t child) { MESSAGE("Timestamps inserted at %d-second or larger intervals, depending on output rates", args.timestamp_interval); } + + childFinished = 1; dump_output(); + if (childTimedOut) { 
+ NL_MESSAGE("ERROR Job timed out"); // print error again at the end of output + } + cleanup(); exit(killed ? EXIT_TIMEOUT : EXIT_SUCCESS); } @@ -771,6 +792,11 @@ int main(int argc, char **argv) { ERROR_ERRNO("Creating output file"); exit(EXIT_OSERROR); } + // chown output file to user "autograde" + if (fchown(child_output_fd, args.user_info.pw_uid, args.user_info.pw_gid) < 0) { + ERROR_ERRNO("Error chowning output file"); + exit(EXIT_OSERROR); + } pid_t pid = fork(); if (pid < 0) { @@ -784,12 +810,6 @@ int main(int argc, char **argv) { // don't quit for this type of error } - // open output file read only to build timestamp:offset map - if ((parent_output_fd = open(OUTPUT_FILE, O_RDONLY)) < 0) { - ERROR_ERRNO("Opening output file by parent process"); - // don't quit for this type of error - } - monitor_child(pid); } From 1972a17db30214dbd2c8d6cb5b884eae7b5a0fbf Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 2 Feb 2018 17:00:22 -0500 Subject: [PATCH 071/131] Add new arguments for autodriver. Remove the confusing term autograder. --- vmms/distDocker.py | 2 +- vmms/ec2SSH.py | 11 +++++++---- vmms/localDocker.py | 2 +- worker.py | 8 ++++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/vmms/distDocker.py b/vmms/distDocker.py index c5726176..8896c4ee 100644 --- a/vmms/distDocker.py +++ b/vmms/distDocker.py @@ -250,7 +250,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): def copyOut(self, vm, destFile): - """ copyOut - Copy the autograder feedback from container to + """ copyOut - Copy the autodriver feedback from container to destFile on the Tango host. Then, destroy that container. Containers are never reused. 
""" diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index d45bedbd..d0fb5975 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -444,10 +444,13 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): self.instanceName(vm.id, vm.name)) # Setting ulimits for VM and running job runcmd = "/usr/bin/time --output=time.out autodriver -u %d -f %d -t \ - %d -o %d autolab &> output" % (config.Config.VM_ULIMIT_USER_PROC, - config.Config.VM_ULIMIT_FILE_SIZE, - runTimeout, - maxOutputFileSize) + %d -o %d -z %s -i %d autolab &> output" % ( + config.Config.VM_ULIMIT_USER_PROC, + config.Config.VM_ULIMIT_FILE_SIZE, + runTimeout, + maxOutputFileSize, + config.Config.AUTODRIVER_LOGGING_TIME_ZONE, + config.Config.AUTODRIVER_TIMESTAMP_INTERVAL) ret = timeout(["ssh"] + self.ssh_flags + ["%s@%s" % (config.Config.EC2_USER_NAME, domain_name), runcmd], runTimeout * 2) # return 3 # xxx inject error to test KEEP_VM_AFTER_FAILURE diff --git a/vmms/localDocker.py b/vmms/localDocker.py index 45b54145..45565aec 100644 --- a/vmms/localDocker.py +++ b/vmms/localDocker.py @@ -155,7 +155,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): def copyOut(self, vm, destFile): - """ copyOut - Copy the autograder feedback from container to + """ copyOut - Copy the autodriver feedback from container to destFile on the Tango host. Then, destroy that container. Containers are never reused. 
""" diff --git a/worker.py b/worker.py index 8684aa75..4997ab88 100644 --- a/worker.py +++ b/worker.py @@ -112,19 +112,19 @@ def appendMsg(self, filename, msg): """ appendMsg - Append a timestamped Tango message to a file """ f = open(filename, "a") - f.write("Autograder [%s]: %s\n" % (datetime.now().ctime(), msg)) + f.write("Autolab [%s]: %s\n" % (datetime.now().ctime(), msg)) f.close() def catFiles(self, f1, f2): """ catFiles - cat f1 f2 > f2, where f1 is the Tango header and f2 is the output from the Autodriver """ - self.appendMsg(f1, "Here is the output from the autograder:\n---") + self.appendMsg(f1, "Output of autodriver from grading VM:\n") (wfd, tmpname)=tempfile.mkstemp(dir=os.path.dirname(f2)) wf=os.fdopen(wfd, "a") with open(f1, "rb") as f1fd: shutil.copyfileobj(f1fd, wf) - # f2 may not exist if autograder failed + # f2 may not exist if autodriver failed try: with open(f2, "rb") as f2fd: shutil.copyfileobj(f2fd, wf) @@ -324,7 +324,7 @@ def run(self): self.jobQueue.makeDead(self.job.id, msg) - # Update the text that users see in the autograder output file + # Update the text that users see in the autodriver output file self.appendMsg(hdrfile, msg) self.catFiles(hdrfile, self.job.outputFile) From caac9b46733716ed30feb62646d750a7accdd4f7 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 9 Feb 2018 16:09:37 -0500 Subject: [PATCH 072/131] Streamline logging in worker's important run() function. 
--- tangoObjects.py | 1 + vmms/ec2SSH.py | 6 +-- worker.py | 107 +++++++++++++++++------------------------------- 3 files changed, 41 insertions(+), 73 deletions(-) diff --git a/tangoObjects.py b/tangoObjects.py index 7d966c86..44c922d2 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -115,6 +115,7 @@ def isNotAssigned(self): return not self.assigned def appendTrace(self, trace_str): + # trace attached to the object can be retrived and sent to rest api caller self.syncRemote() self.trace.append(trace_str) self.updateRemote() diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index d0fb5975..a878f1d9 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -451,11 +451,11 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): maxOutputFileSize, config.Config.AUTODRIVER_LOGGING_TIME_ZONE, config.Config.AUTODRIVER_TIMESTAMP_INTERVAL) + # runTimeout * 2 is a conservative estimate. + # autodriver handles timeout on the target vm. ret = timeout(["ssh"] + self.ssh_flags + ["%s@%s" % (config.Config.EC2_USER_NAME, domain_name), runcmd], runTimeout * 2) - # return 3 # xxx inject error to test KEEP_VM_AFTER_FAILURE return ret - # runTimeout * 2 is a temporary hack. The driver will handle the timout def copyOut(self, vm, destFile): """ copyOut - Copy the file output on the VM to the file @@ -516,7 +516,7 @@ def destroyVM(self, vm): iName = self.instanceName(vm.id, vm.name) self.log.info("Will keep VM %s for further debugging" % iName) instance = self.boto3resource.Instance(vm.ec2_id) - # delete original name tag and replace it with "failed-xxx" + # delete original name tag and replace it with "failed-xyz" # add notes tag for test name tag = self.boto3resource.Tag(vm.ec2_id, "Name", iName) if tag: diff --git a/worker.py b/worker.py index 4997ab88..c23727d7 100644 --- a/worker.py +++ b/worker.py @@ -70,14 +70,6 @@ def rescheduleJob(self, hdrfile, ret, err): of a system error, such as a VM timing out or a connection failure. 
""" - self.log.error("Job %s:%d failed: %s" % - (self.job.name, self.job.id, err)) - self.job.appendTrace( - "%s|Job %s:%d failed: %s" % - (datetime.now().ctime(), - self.job.name, - self.job.id, - err)) # Try a few times before giving up if self.job.retries < Config.JOB_RETRIES: @@ -151,6 +143,30 @@ def notifyServer(self, job): except Exception as e: self.log.debug("Error in notifyServer: %s" % str(e)) + + def afterJob(hdrfile, msg, stage, rc, vmHandling): + self.jobQueue.makeDead(self.job.id, msg) + + # Update the text that users see in the autodriver output file + self.appendMsg(hdrfile, msg) + self.catFiles(hdrfile, self.job.outputFile) + + # Thread exit after termination + self.detachVM(return_vm=returnVM, replace_vm=replaceVM) + self.notifyServer(self.job) + return + + def jobLogAndTrace(self, stageMsg, vm, status=None): + msg = stageMsg + " %s for job %s:%d" % (self.vmms.instanceName(vm.id, vm.name), + self.job.name, self.job.id) + if (status != None): + if (status == 0): + msg = "done " + msg + else: + msg = "failed " + msg + " (status=%d)" % status + self.log.info(msg) + self.job.appendTrace("%s|%s" % (datetime.now().ctime(), msg)) + # # Main worker function # @@ -175,57 +191,26 @@ def run(self): # Assigning job to a preallocated VM if self.preVM: # self.preVM: - self.log.debug("Assigning job to preallocated VM") self.job.vm = self.preVM self.job.updateRemote() - self.log.info("Assigned job %s:%d existing VM %s" % - (self.job.name, self.job.id, - self.vmms.instanceName(self.preVM.id, - self.preVM.name))) - self.job.appendTrace("%s|Assigned job %s:%d existing VM %s" % - (datetime.now().ctime(), - self.job.name, self.job.id, - self.vmms.instanceName(self.preVM.id, - self.preVM.name))) - self.log.debug("Assigned job to preallocated VM") + self.jobLogAndTrace("assigned VM (preallocated)", self.preVM) + # Assigning job to a new VM else: - self.log.debug("Assigning job to a new VM") self.job.vm.id = self.job.id self.job.updateRemote() - self.log.info("Assigned 
job %s:%d new VM %s" % - (self.job.name, self.job.id, - self.vmms.instanceName(self.job.vm.id, - self.job.vm.name))) - self.job.appendTrace( - "%s|Assigned job %s:%d new VM %s" % - (datetime.now().ctime(), - self.job.name, - self.job.id, - self.vmms.instanceName( - self.job.vm.id, - self.job.vm.name))) - # Host name returned from EC2 is stored in the vm object self.vmms.initializeVM(self.job.vm) - self.log.debug("Asigned job to a new VM") + self.jobLogAndTrace("assigned VM (just initialized)", self.job.vm) vm = self.job.vm # Wait for the instance to be ready - self.log.debug("Job %s:%d waiting for VM %s" % - (self.job.name, self.job.id, - self.vmms.instanceName(vm.id, vm.name))) - self.job.appendTrace("%s|Job %s:%d waiting for VM %s" % - (datetime.now().ctime(), - self.job.name, self.job.id, - self.vmms.instanceName(vm.id, vm.name))) - self.log.debug("Waiting for VM") + self.jobLogAndTrace("waiting for VM", vm) ret["waitvm"] = self.vmms.waitVM(vm, Config.WAITVM_TIMEOUT) - - self.log.debug("Waited for VM") + self.jobLogAndTrace("waiting for VM", vm, ret["waitvm"]) # If the instance did not become ready in a reasonable # amount of time, then reschedule the job, detach the VM, @@ -241,49 +226,29 @@ def run(self): # Thread Exit after waitVM timeout return - self.log.info("VM %s ready for job %s:%d" % - (self.vmms.instanceName(vm.id, vm.name), - self.job.name, self.job.id)) - self.job.appendTrace("%s|VM %s ready for job %s:%d" % - (datetime.now().ctime(), - self.vmms.instanceName(vm.id, vm.name), - self.job.name, self.job.id)) - # Copy input files to VM + self.jobLogAndTrace("copying to VM", vm) ret["copyin"] = self.vmms.copyIn(vm, self.job.input) if ret["copyin"] != 0: Config.copyin_errors += 1 - self.log.info("Input copied for job %s:%d [status=%d]" % - (self.job.name, self.job.id, ret["copyin"])) - self.job.appendTrace("%s|Input copied for job %s:%d [status=%d]" % - (datetime.now().ctime(), - self.job.name, - self.job.id, ret["copyin"])) + 
self.jobLogAndTrace("copying to VM", vm, ret["copyin"]) # Run the job on the virtual machine + self.jobLogAndTrace("running on VM", vm) ret["runjob"] = self.vmms.runJob( vm, self.job.timeout, self.job.maxOutputFileSize) + self.jobLogAndTrace("running on VM", vm, ret["runjob"]) if ret["runjob"] != 0: Config.runjob_errors += 1 if ret["runjob"] == -1: Config.runjob_timeouts += 1 - self.log.info("Job %s:%d executed [status=%s]" % - (self.job.name, self.job.id, ret["runjob"])) - self.job.appendTrace("%s|Job %s:%d executed [status=%s]" % - (datetime.now().ctime(), - self.job.name, self.job.id, - ret["runjob"])) # Copy the output back. + self.jobLogAndTrace("copying from VM", vm) ret["copyout"] = self.vmms.copyOut(vm, self.job.outputFile) + self.jobLogAndTrace("copying from VM", vm, ret["copyout"]) if ret["copyout"] != 0: Config.copyout_errors += 1 - self.log.info("Output copied for job %s:%d [status=%d]" % - (self.job.name, self.job.id, ret["copyout"])) - self.job.appendTrace("%s|Output copied for job %s:%d [status=%d]" - % (datetime.now().ctime(), - self.job.name, - self.job.id, ret["copyout"])) # Job termination. Notice that Tango considers # things like runjob timeouts and makefile errors to be @@ -312,6 +277,8 @@ def run(self): # the VM. msg = "Error: OS error while running job on VM" (returnVM, replaceVM) = (False, True) + # doNotDestroy, combined with KEEP_VM_AFTER_FAILURE, will sent + # the vm aside for further investigation after failure. self.job.vm.doNotDestroy = True self.job.vm.notes = str(self.job.id) + "_" + self.job.name else: # This should never happen From c47d8891a54f8cccef3ba4abd2938fa49c906dd1 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 14 Feb 2018 15:29:46 -0500 Subject: [PATCH 073/131] worker's job exec (copyin/run/copyout) logic flow cleanup complete. 
--- worker.py | 68 ++++++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/worker.py b/worker.py index c23727d7..53db8a0d 100644 --- a/worker.py +++ b/worker.py @@ -120,8 +120,9 @@ def catFiles(self, f1, f2): try: with open(f2, "rb") as f2fd: shutil.copyfileobj(f2fd, wf) - except OSError: - pass + except IOError: + wf.write("NO OUTPUT FILE\n") + wf.close() os.rename(tmpname, f2) os.remove(f1) @@ -143,8 +144,8 @@ def notifyServer(self, job): except Exception as e: self.log.debug("Error in notifyServer: %s" % str(e)) - - def afterJob(hdrfile, msg, stage, rc, vmHandling): + def afterJobExecution(self, hdrfile, msg, vmHandling): + (returnVM, replaceVM) = vmHandling self.jobQueue.makeDead(self.job.id, msg) # Update the text that users see in the autodriver output file @@ -205,6 +206,7 @@ def run(self): self.jobLogAndTrace("assigned VM (just initialized)", self.job.vm) vm = self.job.vm + (returnVM, replaceVM) = (True, False) # Wait for the instance to be ready self.jobLogAndTrace("waiting for VM", vm) @@ -229,48 +231,35 @@ def run(self): # Copy input files to VM self.jobLogAndTrace("copying to VM", vm) ret["copyin"] = self.vmms.copyIn(vm, self.job.input) + self.jobLogAndTrace("copying to VM", vm, ret["copyin"]) if ret["copyin"] != 0: Config.copyin_errors += 1 - self.jobLogAndTrace("copying to VM", vm, ret["copyin"]) + msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"]) + self.afterJobExecution(hdrfile, msg, (returnVM, replaceVM)) + return # Run the job on the virtual machine self.jobLogAndTrace("running on VM", vm) ret["runjob"] = self.vmms.runJob( vm, self.job.timeout, self.job.maxOutputFileSize) self.jobLogAndTrace("running on VM", vm, ret["runjob"]) - if ret["runjob"] != 0: - Config.runjob_errors += 1 - if ret["runjob"] == -1: - Config.runjob_timeouts += 1 + # runjob may have failed. but go on with copyout to get the output if any - # Copy the output back. 
+ # Copy the output back, even if runjob has failed self.jobLogAndTrace("copying from VM", vm) ret["copyout"] = self.vmms.copyOut(vm, self.job.outputFile) self.jobLogAndTrace("copying from VM", vm, ret["copyout"]) - if ret["copyout"] != 0: - Config.copyout_errors += 1 - - # Job termination. Notice that Tango considers - # things like runjob timeouts and makefile errors to be - # normal termination and doesn't reschedule the job. - self.log.info("Success: job %s:%d finished" % - (self.job.name, self.job.id)) - # Move the job from the live queue to the dead queue - # with an explanatory message - msg = "Success: Autodriver returned normally" - (returnVM, replaceVM) = (True, False) - if ret["copyin"] != 0: - msg = "Error: Copy in to VM failed (status=%d)" % ( - ret["copyin"]) - elif ret["runjob"] != 0: + # handle failure(s) of runjob and/or copyout. runjob error takes priority. + if ret["runjob"] != 0: + Config.runjob_errors += 1 if ret["runjob"] == 1: # This should never happen - msg = "Error: Autodriver usage error (status=%d)" % ( - ret["runjob"]) - elif ret["runjob"] == 2: - msg = "Error: Job timed out after %d seconds" % ( + msg = "Error: Autodriver usage error" + elif ret["runjob"] == -1 or ret["runjob"] == 2: # both are timeouts + Config.runjob_timeouts += 1 + msg = "Error: Job timed out. timeout setting: %d seconds" % ( self.job.timeout) - elif (ret["runjob"] == 3): # EXIT_OSERROR in Autodriver + elif ret["runjob"] == 3: # EXIT_OSERROR in Autodriver # Abnormal job termination (Autodriver encountered an OS # error). Assume that the VM is damaged. 
Destroy this VM # and do not retry the job since the job may have damaged @@ -284,20 +273,13 @@ def run(self): else: # This should never happen msg = "Error: Unknown autodriver error (status=%d)" % ( ret["runjob"]) - elif ret["copyout"] != 0: - msg += "Error: Copy out from VM failed (status=%d)" % ( - ret["copyout"]) - - self.jobQueue.makeDead(self.job.id, msg) - - # Update the text that users see in the autodriver output file - self.appendMsg(hdrfile, msg) - self.catFiles(hdrfile, self.job.outputFile) + Config.copyout_errors += 1 + msg += "Error: Copy out from VM failed (status=%d)" % (ret["copyout"]) + else: + msg = "Success: Autodriver returned normally" - # Thread exit after termination - self.detachVM(return_vm=returnVM, replace_vm=replaceVM) - self.notifyServer(self.job) + self.afterJobExecution(hdrfile, msg, (returnVM, replaceVM)) return # From 9565275dab5d0fa614b96b33bad642559f7714a4 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 22 Feb 2018 14:00:44 -0500 Subject: [PATCH 074/131] jobs with big ids starve after job id wraps. Fix with a timestamp. 
--- tangoObjects.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tangoObjects.py b/tangoObjects.py index 44c922d2..e1bb1a4d 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -5,6 +5,8 @@ import redis import pickle import Queue +import logging +from datetime import datetime from config import Config redisConnection = None @@ -99,6 +101,7 @@ def __init__(self, vm=None, self._remoteLocation = None self.accessKeyId = accessKeyId self.accessKey = accessKey + self.tm = datetime.now() def makeAssigned(self): self.syncRemote() @@ -291,6 +294,7 @@ class TangoRemoteDictionary(): def __init__(self, object_name): self.r = getRedisConnection() self.hash_name = object_name + self.log = logging.getLogger("TangoRemoteDictionary") def set(self, id, obj): pickled_obj = pickle.dumps(obj) @@ -328,8 +332,12 @@ def _clean(self): self.r.delete(self.hash_name) def iteritems(self): - return iter([(i, self.get(i)) for i in xrange(1,Config.MAX_JOBID+1) - if self.get(i) != None]) + # find all non-empty spots in the job id spectrum (actual jobs) and sort + # by the time of creation to prevent starvation of jobs with larger ids + + return iter(sorted([(i, self.get(i)) for i in xrange(1,Config.MAX_JOBID+1) + if self.get(i) != None], key=lambda x: x[1].tm)) + class TangoNativeDictionary(): @@ -356,8 +364,8 @@ def delete(self, id): del self.dict[str(id)] def iteritems(self): - return iter([(i, self.get(i)) for i in xrange(1,Config.MAX_JOBID+1) - if self.get(i) != None]) + return iter(sorted([(i, self.get(i)) for i in xrange(1,Config.MAX_JOBID+1) + if self.get(i) != None], key=lambda x: x[1].tm)) def _clean(self): # only for testing From c80a5d3362320e1f679ac4283e4ae3399da2f836 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 22 Feb 2018 15:10:50 -0500 Subject: [PATCH 075/131] Disable wrap-around of vm ids. 
--- jobManager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jobManager.py b/jobManager.py index d96e5b97..4d1294b8 100644 --- a/jobManager.py +++ b/jobManager.py @@ -52,8 +52,9 @@ def _getNextID(self): """ id = self.nextId self.nextId += 1 - if self.nextId > 99999: - self.nextId = 10000 + # xxx simply wrap the id without guarding condition is bad. disable for now. + # if self.nextId > 99999: + # self.nextId = 10000 return id def __manage(self): From 6c03416b8bcb7555d2a430a587b4a1c39248c395 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 26 Feb 2018 15:06:43 -0500 Subject: [PATCH 076/131] Move timestamping into appendTrace function and use local time. --- jobManager.py | 5 ++--- jobQueue.py | 5 ++--- tango.py | 42 ++++++++++++++++-------------------------- tangoObjects.py | 2 +- worker.py | 2 +- 5 files changed, 22 insertions(+), 34 deletions(-) diff --git a/jobManager.py b/jobManager.py index 4d1294b8..7f4e1039 100644 --- a/jobManager.py +++ b/jobManager.py @@ -106,9 +106,8 @@ def __manage(self): # Now dispatch the job to a worker self.log.info("Dispatched job %s:%d to %s [try %d]" % (job.name, job.id, preVM.name, job.retries)) - job.appendTrace( - "%s|Dispatched job %s:%d [try %d]" % - (datetime.utcnow().ctime(), job.name, job.id, job.retries)) + job.appendTrace("Dispatched job %s:%d [try %d]" % + (job.name, job.id, job.retries)) Worker( job, diff --git a/jobQueue.py b/jobQueue.py index 5b53b4f5..624f07ff 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -90,8 +90,7 @@ def add(self, job): self.log.debug("add| Acquired lock to job queue.") self.liveJobs.set(job.id, job) - job.appendTrace("%s|Added job %s:%d to queue" % - (datetime.utcnow().ctime(), job.name, job.id)) + job.appendTrace("Added job %s:%d to queue" % (job.name, job.id)) self.log.debug("Ref: " + str(job._remoteLocation)) self.log.debug("job_id: " + str(job.id)) @@ -275,7 +274,7 @@ def makeDead(self, id, reason): (job.name, job.id, reason)) self.deadJobs.set(id, 
job) self.liveJobs.delete(id) - job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), reason)) + job.appendTrace(reason) self.queueLock.release() self.log.debug("makeDead| Released lock to job queue.") return status diff --git a/tango.py b/tango.py index fa1e6fca..f0394e01 100755 --- a/tango.py +++ b/tango.py @@ -297,30 +297,26 @@ def __validateJob(self, job, vmms): # Every job must have a name if not job.name: self.log.error("validateJob: Missing job.name") - job.appendTrace("%s|validateJob: Missing job.name" % - (datetime.utcnow().ctime())) + job.appendTrace("validateJob: Missing job.name") errors += 1 # Check the virtual machine field if not job.vm: self.log.error("validateJob: Missing job.vm") - job.appendTrace("%s|validateJob: Missing job.vm" % - (datetime.utcnow().ctime())) + job.appendTrace("validateJob: Missing job.vm") errors += 1 else: if not job.vm.image: self.log.error("validateJob: Missing job.vm.image") - job.appendTrace("%s|validateJob: Missing job.vm.image" % - (datetime.utcnow().ctime())) + job.appendTrace("validateJob: Missing job.vm.image") errors += 1 else: vobj = vmms[Config.VMMS_NAME] imgList = vobj.getImages() if job.vm.image not in imgList: - self.log.error("validateJob: Image not found: %s" % - job.vm.image) - job.appendTrace("%s|validateJob: Image not found: %s" % - (datetime.utcnow().ctime(), job.vm.image)) + self.log.error("validateJob: Image not found: %s" % job.vm.image) + + job.appendTrace("validateJob: Image not found: %s" % job.vm.image) errors += 1 else: (name, ext) = os.path.splitext(job.vm.image) @@ -328,26 +324,23 @@ def __validateJob(self, job, vmms): if not job.vm.vmms: self.log.error("validateJob: Missing job.vm.vmms") - job.appendTrace("%s|validateJob: Missing job.vm.vmms" % - (datetime.utcnow().ctime())) + job.appendTrace("validateJob: Missing job.vm.vmms") errors += 1 else: if job.vm.vmms not in vmms: self.log.error("validateJob: Invalid vmms name: %s" % job.vm.vmms) - job.appendTrace("%s|validateJob: Invalid vmms name: 
%s" % - (datetime.utcnow().ctime(), job.vm.vmms)) + job.appendTrace("validateJob: Invalid vmms name: %s" % job.vm.vmms) errors += 1 # Check the output file if not job.outputFile: self.log.error("validateJob: Missing job.outputFile") - job.appendTrace("%s|validateJob: Missing job.outputFile" % (datetime.utcnow().ctime())) + job.appendTrace("validateJob: Missing job.outputFile") errors += 1 else: if not os.path.exists(os.path.dirname(job.outputFile)): - self.log.error("validateJob: Bad output path: %s", job.outputFile) - job.appendTrace("%s|validateJob: Bad output path: %s" % - (datetime.utcnow().ctime(), job.outputFile)) + self.log.error("validateJob: Bad output path: %s" % job.outputFile) + job.appendTrace("validateJob: Bad output path: %s" % job.outputFile) errors += 1 # Check for max output file size parameter @@ -361,14 +354,12 @@ def __validateJob(self, job, vmms): for inputFile in job.input: if not inputFile.localFile: self.log.error("validateJob: Missing inputFile.localFile") - job.appendTrace("%s|validateJob: Missing inputFile.localFile" % - (datetime.utcnow().ctime())) + job.appendTrace("validateJob: Missing inputFile.localFile") errors += 1 else: if not os.path.exists(os.path.dirname(job.outputFile)): - self.log.error("validateJob: Bad output path: %s", job.outputFile) - job.appendTrace("%s|validateJob: Bad output path: %s" % - (datetime.utcnow().ctime(), job.outputFile)) + self.log.error("validateJob: Bad output path: %s" % job.outputFile) + job.appendTrace("validateJob: Bad output path: %s" % job.outputFile) errors += 1 if inputFile.destFile == 'Makefile': @@ -377,7 +368,7 @@ def __validateJob(self, job, vmms): # Check if input files include a Makefile if not hasMakefile: self.log.error("validateJob: Missing Makefile in input files.") - job.appendTrace("%s|validateJob: Missing Makefile in input files." 
% (datetime.utcnow().ctime())) + job.appendTrace("validateJob: Missing Makefile in input files.") errors+=1 # Check if job timeout has been set; If not set timeout to default @@ -389,8 +380,7 @@ def __validateJob(self, job, vmms): # Any problems, return an error status if errors > 0: self.log.error("validateJob: Job rejected: %d errors" % errors) - job.appendTrace("%s|validateJob: Job rejected: %d errors" % - (datetime.utcnow().ctime(), errors)) + job.appendTrace("validateJob: Job rejected: %d errors" % errors) return -1 else: return 0 diff --git a/tangoObjects.py b/tangoObjects.py index e1bb1a4d..a56e572a 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -120,7 +120,7 @@ def isNotAssigned(self): def appendTrace(self, trace_str): # trace attached to the object can be retrived and sent to rest api caller self.syncRemote() - self.trace.append(trace_str) + self.trace.append("%s|%s" % (datetime.now().ctime(), trace_str)) self.updateRemote() def setId(self, new_id): diff --git a/worker.py b/worker.py index 53db8a0d..b1a3271e 100644 --- a/worker.py +++ b/worker.py @@ -166,7 +166,7 @@ def jobLogAndTrace(self, stageMsg, vm, status=None): else: msg = "failed " + msg + " (status=%d)" % status self.log.info(msg) - self.job.appendTrace("%s|%s" % (datetime.now().ctime(), msg)) + self.job.appendTrace(msg) # # Main worker function From b5730f380c4071598ddf188f22efe71ad01cdd0b Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 28 Feb 2018 10:54:24 -0500 Subject: [PATCH 077/131] Remove unused function descrementPoolSize --- preallocator.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/preallocator.py b/preallocator.py index 242b56ac..249ed0ce 100644 --- a/preallocator.py +++ b/preallocator.py @@ -42,21 +42,6 @@ def freePoolSize(self, vmName): else: return 0 - def decrementPoolSize(self, vm): - """ - Called by worker to shrink the pool, after returning a vm to free pool - """ - - if not (hasattr(Config, 'POOL_SIZE_LOW_WATER_MARK') and - 
Config.POOL_SIZE_LOW_WATER_MARK >= 0 and vm.name in self.machines.keys()): - return - - delta = self.freePoolSize(vm.name) - Config.POOL_SIZE_LOW_WATER_MARK - if delta > 0: - self.log.info("decrementPoolSize: remove %d vms from pool %s" % (delta, vm.name)) - for i in range(delta): - threading.Thread(target=self.__destroy(vm)).start() - def incrementPoolSize(self, vm, delta): """ Called by jobQueue to create the pool and allcoate given number of vms From f8d38002dc0e9755c388d707ae78f6f526bffac8 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 28 Feb 2018 17:14:12 -0500 Subject: [PATCH 078/131] Cleanup script and add ability to terminate aws instances. --- tools/ec2Read.py | 142 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 104 insertions(+), 38 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index b80214c8..c1b1f530 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -12,9 +12,21 @@ import boto3 import pytz import tzlocal +import argparse -# test vmms.ec2SSH's image extraction code, etc -# also serve as a template of accessing the ec2SSH vmms +# Read aws instances, Tango preallocator pools, etc. +# Also serve as sample code for quick testing of Tango/VMMS functionalities. + +class CommandLine(): + def __init__(self): + parser = argparse.ArgumentParser(description='List AWS vms and preallocator pools') + parser.add_argument('-d', '--instances', metavar='instance', nargs='+', + help="destroy vms by name tags or AWS ids (can be partial). 
\"NoNameTag\" (case insensitive) deletes all instances without a \"Name\" tag") + self.args = parser.parse_args() + +cmdLine = CommandLine() +destroyList = cmdLine.args.instances +sortedInstances = [] local_tz = pytz.timezone("EST") def utc_to_local(utc_dt): @@ -32,11 +44,18 @@ def destroyInstances(): def utc_to_local(utc_dt): local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) - return local_tz.normalize(local_dt) + return local_dt.strftime("%Y%m%d-%H:%M:%S") + +# to test destroying instances without "Name" tag +def deleteNameTag(): + response = boto3connection.describe_instances() + for reservation in response["Reservations"]: + for instance in reservation["Instances"]: + boto3connection.delete_tags(Resources=[instance["InstanceId"]], + Tags=[{"Key": "Name"}]) -# test changing tags to keep the vm after test failure +# to test changing tags to keep the vm after test failure def changeTags(instanceId, name, notes): - return print "change tags for", instanceId instance = boto3resource.Instance(instanceId) tag = boto3resource.Tag(instanceId, "Name", name) @@ -45,35 +64,45 @@ def changeTags(instanceId, name, notes): instance.create_tags(Tags=[{"Key": "Name", "Value": "failed-" + name}]) instance.create_tags(Tags=[{"Key": "Notes", "Value": notes}]) -def listInstancesLong(): +def instanceNameTag(instance): + name = "None" + if "Tags" in instance: + for tag in instance["Tags"]: + if tag["Key"] == "Name": + name = tag["Value"] + return name + +def queryInstances(): + global sortedInstances nameInstances = [] response = boto3connection.describe_instances() for reservation in response["Reservations"]: for instance in reservation["Instances"]: if instance["State"]["Name"] != "running": continue - if "Tags" in instance: - nameTag = (item for item in instance["Tags"] if item["Key"] == "Name").next() - nameInstances.append({"Name": nameTag["Value"] if nameTag else "None", - "Instance": instance}) - else: - nameInstances.append({"Name": "None", "Instance": instance}) 
+ nameInstances.append({"Name": instanceNameTag(instance), "Instance": instance}) sortedInstances = sorted(nameInstances, key=lambda x: x["Name"]) - # changeTags(sortedInstances[-1]["Instance"]["InstanceId"], - # sortedInstances[-1]["Name"], "test-name-xxx") - - print len(nameInstances), "instances:" - for item in sorted(nameInstances, key=lambda x: x["Name"]): - # pp = pprint.PrettyPrinter(indent=2) - # pp.pprint(instance) + print len(sortedInstances), "instances:" + +def listInstances(knownInstances=None): + global sortedInstances + instanceList = [] + if knownInstances: + instanceList = knownInstances + else: + queryInstances() + instanceList = sortedInstances + + for item in instanceList: instance = item["Instance"] launchTime = utc_to_local(instance["LaunchTime"]) print("%s: %s %s %s" % (item["Name"], instance["InstanceId"], instance["PublicIpAddress"], launchTime)) if "Tags" in instance: for tag in instance["Tags"]: - print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) + if (tag["Key"] != "Name"): + print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) else: print("\t No tags") @@ -89,13 +118,7 @@ def listInstancesLong(): print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) """ -def listInstances(): - """ - vms = ec2.getVMs() - print "aws instances", len(vms) - for vm in sorted(vms, key=lambda x: x.name): - print "vm", vm.name, vm.ec2_id - """ +def listPools(): print "pools", ec2.img2ami.keys() for key in server.preallocator.machines.keys(): pool = server.preallocator.getPool(key) @@ -112,14 +135,6 @@ def createInstances(num): vm = TangoMachine(vmms="ec2SSH", image=imageName) server.preallocVM(vm, num) -def shrinkPools(): - for imageName in pools: - (poolName, ext) = os.path.splitext(imageName) - vm = TangoMachine(vmms="ec2SSH", image=imageName) - vm.name = poolName - print "shrink pool", vm.name - server.preallocator.decrementPoolSize(vm) - def destroyRedisPools(): for key in server.preallocator.machines.keys(): print "clean up pool", key @@ -149,17 
+164,68 @@ def allocateVMs(): ec2 = server.preallocator.vmms["ec2SSH"] pools = ec2.img2ami -listInstancesLong() +if destroyList: + print "Current" + listInstances() + totalTerminated = [] + + for partialStr in destroyList: + matchingInstances = [] + if partialStr.lower() == "NoNameTag".lower(): # without "Name" tag + for item in sortedInstances: + if "None" == instanceNameTag(item["Instance"]): + matchingInstances.append(item) + elif partialStr.startswith("i-"): # match instance id + for item in sortedInstances: + if item["Instance"]["InstanceId"].startswith(partialStr): + matchingInstances.append(item) + else: + for item in sortedInstances: # match a "Name" tag that is not None + if instanceNameTag(item["Instance"]).startswith(partialStr): + matchingInstances.append(item) + + # remove the items already terminated + instancesToTerminate = [] + for item in matchingInstances: + if not any(x["Instance"]["InstanceId"] == item["Instance"]["InstanceId"] for x in totalTerminated): + instancesToTerminate.append(item) + totalTerminated.append(item) + + if instancesToTerminate: + print "terminate %d instances matching query string \"%s\"" % (len(instancesToTerminate), partialStr) + listInstances(instancesToTerminate) + for item in instancesToTerminate: + boto3connection.terminate_instances(InstanceIds=[item["Instance"]["InstanceId"]]) + else: + print "no instances matching query string \"%s\"" % partialStr + # end of for loop partialStr + + print "Aftermath" + listInstances() + exit() + listInstances() +listPools() exit() + destroyInstances() destroyRedisPools() +listInstances() +listPools() +exit() + createInstances(2) -shrinkPools() +listInstances() +listPools() exit() -allocateVMs() +allocateVMs() # should see some vms disappear from free pool +listInstances() +listPools() exit() + +# resetTango will destroy all known vms that are NOT in free pool server.resetTango(server.preallocator.vmms) listInstances() +listPools() From c4b98b2ff2e540ec632eb28552c7dbf008bbae54 Mon Sep 
17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 12 Mar 2018 15:51:49 -0400 Subject: [PATCH 079/131] Change boolean variable name to avoid the word "not". --- tangoObjects.py | 2 +- vmms/ec2SSH.py | 3 ++- worker.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tangoObjects.py b/tangoObjects.py index a56e572a..8f41dfa4 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -65,7 +65,7 @@ def __init__(self, name="DefaultTestVM", image=None, vmms=None, self.instance_id = id # The following attributes can instruct vmms to set the test machine # aside for further investigation. - self.doNotDestroy = False + self.keepForDebugging = False self.notes = None def __repr__(self): diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index a878f1d9..9ca654fb 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -511,8 +511,9 @@ def destroyVM(self, vm): self.log.info("destroyVM: %s %s %s %s" % (vm.ec2_id, vm.name, vm.doNotDestroy, vm.notes)) + # Keep the vm and mark with meaningful tags for debugging if hasattr(config.Config, 'KEEP_VM_AFTER_FAILURE') and \ - config.Config.KEEP_VM_AFTER_FAILURE and vm.doNotDestroy: + config.Config.KEEP_VM_AFTER_FAILURE and vm.keepForDebugging: iName = self.instanceName(vm.id, vm.name) self.log.info("Will keep VM %s for further debugging" % iName) instance = self.boto3resource.Instance(vm.ec2_id) diff --git a/worker.py b/worker.py index b1a3271e..ba0bc9d2 100644 --- a/worker.py +++ b/worker.py @@ -268,7 +268,7 @@ def run(self): (returnVM, replaceVM) = (False, True) # doNotDestroy, combined with KEEP_VM_AFTER_FAILURE, will sent # the vm aside for further investigation after failure. 
- self.job.vm.doNotDestroy = True + self.job.vm.keepForDebugging = True self.job.vm.notes = str(self.job.id) + "_" + self.job.name else: # This should never happen msg = "Error: Unknown autodriver error (status=%d)" % ( From 72c29d70014fa10cdb9ae77e4316483c596996af Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 12 Mar 2018 16:20:58 -0400 Subject: [PATCH 080/131] Remove reference to aws auto scaling group, also fix a problem with the last commit. --- vmms/ec2SSH.py | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 9ca654fb..4b896bc3 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -148,25 +148,6 @@ def __init__(self, accessKeyId=None, accessKey=None): if (len(ignoredAmis) > 0): self.log.info("Ignored amis %s due to lack of proper name tag" % str(ignoredAmis)) - # preliminary code for auto scaling group (configured by EC2_AUTO_SCALING_GROUP_NAME) - # Here we get the pointer to the group, if any. - # When an instance is created, it's attached to the group. - # When an instance is terminated, it's detached. 
- self.asg = None - self.auto_scaling_group = None - self.auto_scaling_group_name = None - if hasattr(config.Config, 'EC2_AUTO_SCALING_GROUP_NAME') and config.Config.EC2_AUTO_SCALING_GROUP_NAME: - self.asg = boto3.client("autoscaling", config.Config.EC2_REGION) - groups = self.asg.describe_auto_scaling_groups(AutoScalingGroupNames=[config.Config.EC2_AUTO_SCALING_GROUP_NAME]) - if len(groups['AutoScalingGroups']) == 1: - self.auto_scaling_group = groups['AutoScalingGroups'][0] - self.auto_scaling_group_name = config.Config.EC2_AUTO_SCALING_GROUP_NAME - self.log.info("Use aws auto scaling group %s" % self.auto_scaling_group_name) - - instances = self.asg.describe_auto_scaling_instances()['AutoScalingInstances'] - else: - self.log.info("Cannot find auto scaling group %s" % config.Config.EC2_AUTO_SCALING_GROUP_NAME) - def instanceName(self, id, name): """ instanceName - Constructs a VM instance name. Always use this function when you need a VM instance name. Never generate @@ -331,11 +312,6 @@ def initializeVM(self, vm): newInstance.public_dns_name, newInstance.ip_address)) - if self.auto_scaling_group: - self.asg.attach_instances(InstanceIds=[newInstance.id], - AutoScalingGroupName=self.auto_scaling_group_name) - self.log.info("attach new instance %s to auto scaling group" % newInstance.id) - # Save domain and id ssigned by EC2 in vm object vm.domain_name = newInstance.ip_address vm.ec2_id = newInstance.id @@ -509,7 +485,7 @@ def destroyVM(self, vm): self.log.info("destroyVM: instance non-exist %s %s" % (vm.ec2_id, vm.name)) return [] - self.log.info("destroyVM: %s %s %s %s" % (vm.ec2_id, vm.name, vm.doNotDestroy, vm.notes)) + self.log.info("destroyVM: %s %s %s %s" % (vm.ec2_id, vm.name, vm.keepForDebugging, vm.notes)) # Keep the vm and mark with meaningful tags for debugging if hasattr(config.Config, 'KEEP_VM_AFTER_FAILURE') and \ @@ -531,17 +507,6 @@ def destroyVM(self, vm): if not self.useDefaultKeyPair: self.deleteKeyPair() - if self.auto_scaling_group: - response 
= self.asg.describe_auto_scaling_instances(InstanceIds=[vm.ec2_id], - MaxRecords=1) - if len(response['AutoScalingInstances']) == 1: - self.asg.detach_instances(InstanceIds=[vm.ec2_id], - AutoScalingGroupName=self.auto_scaling_group_name, - ShouldDecrementDesiredCapacity=True) - self.log.info("detach instance %s %s from auto scaling group" % (vm.ec2_id, vm.name)) - else: - self.log.info("instance %s %s not in auto scaling group" % (vm.ec2_id, vm.name)) - return ret def safeDestroyVM(self, vm): From ae1751e5015234c8a9ac8f197f1d1071313672d3 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 12 Mar 2018 17:46:04 -0400 Subject: [PATCH 081/131] make autodriver config variables optional for backward compatibility. --- vmms/ec2SSH.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 4b896bc3..b627f3d7 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -418,15 +418,22 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): domain_name = self.domainName(vm) self.log.debug("runJob: Running job on VM %s" % self.instanceName(vm.id, vm.name)) - # Setting ulimits for VM and running job - runcmd = "/usr/bin/time --output=time.out autodriver -u %d -f %d -t \ - %d -o %d -z %s -i %d autolab &> output" % ( - config.Config.VM_ULIMIT_USER_PROC, - config.Config.VM_ULIMIT_FILE_SIZE, - runTimeout, - maxOutputFileSize, - config.Config.AUTODRIVER_LOGGING_TIME_ZONE, - config.Config.AUTODRIVER_TIMESTAMP_INTERVAL) + + # Setting arguments for VM and running job + runcmd = "/usr/bin/time --output=time.out autodriver \ + -u %d -f %d -t %d -o %d " % ( + config.Config.VM_ULIMIT_USER_PROC, + config.Config.VM_ULIMIT_FILE_SIZE, + runTimeout, + maxOutputFileSize) + if hasattr(config.Config, 'AUTODRIVER_LOGGING_TIME_ZONE') and \ + config.Config.AUTODRIVER_LOGGING_TIME_ZONE: + runcmd = runcmd + ("-z %s " % config.Config.AUTODRIVER_LOGGING_TIME_ZONE) + if hasattr(config.Config, 'AUTODRIVER_TIMESTAMP_INTERVAL') 
and \ + config.Config.AUTODRIVER_TIMESTAMP_INTERVAL: + runcmd = runcmd + ("-i %d " % config.Config.AUTODRIVER_TIMESTAMP_INTERVAL) + runcmd = runcmd + "autolab &> output" + # runTimeout * 2 is a conservative estimate. # autodriver handles timeout on the target vm. ret = timeout(["ssh"] + self.ssh_flags + From 2c1eac80b1b555d52d1c346ee408d8c1f8758941 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 12 Mar 2018 17:49:01 -0400 Subject: [PATCH 082/131] Add missing config variable and comments. --- config.template.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/config.template.py b/config.template.py index 5f72fe1c..6075c54f 100644 --- a/config.template.py +++ b/config.template.py @@ -23,6 +23,7 @@ class Config: PORT = 3000 # Log file. Setting this to None sends the server output to stdout + # Strongly suggest setting up a log file LOGFILE = None # Logging level @@ -58,7 +59,7 @@ class Config: # We have the option to reuse VMs or discard them after each use # xxxXXX??? strongly suspect the code path for the False case - # not working, after a failed experiment. + # not working, after a failed experiment. -- czang@cmu.edu REUSE_VMS = True # Worker waits this many seconds for functions waitvm, copyin (per @@ -70,6 +71,7 @@ class Config: COPYOUT_TIMEOUT = 30 # time zone and timestamp report interval for autodriver execution + # both are optional. AUTODRIVER_LOGGING_TIME_ZONE = "UTC" # e.g. "America/New_York". AUTODRIVER_TIMESTAMP_INTERVAL = 0 # in seconds. 
0 => no timestamp insersion @@ -106,6 +108,9 @@ class Config: # Give VMMS this many seconds to destroy a VM before giving up DESTROY_SECS = 5 + # When set to True, put the vm aside for debugging after OS ERROR by autodriver + KEEP_VM_AFTER_FAILURE = None + # Time to wait between creating VM instances to give DNS time to cool down CREATEVM_SECS = 1 @@ -160,7 +165,6 @@ class Config: EC2_REGION = '' EC2_USER_NAME = '' - EC2_AUTO_SCALING_GROUP_NAME = None # or the name of the auto scaling group DEFAULT_INST_TYPE = '' DEFAULT_SECURITY_GROUP = '' SECURITY_KEY_PATH = '' From c8fafd0a268fddc0423d601785f5515e7494d250 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 12 Mar 2018 17:50:44 -0400 Subject: [PATCH 083/131] Add a couple of git ignore files. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index be2afa23..ec6361f7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ vmms/id_rsa* courselabs/* # config config.py +output_gen +.gitignore + # Virtualenv .Python From fcc8777708dcb45ad38357de74689496ee272143 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 20 Mar 2018 14:10:39 -0400 Subject: [PATCH 084/131] Better comments for run_job scripts. --- tools/config_for_run_jobs.py | 10 +++++----- tools/run_jobs.py | 10 ++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index 2fa03a77..e94d3232 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -16,9 +16,9 @@ class Config: # YOUR lab definitions. 
The index of the lab is given to run_job.py labs = [ - {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746.img"}, - {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "newPool.img"}, - {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "newPool.img"}] + {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "course-746.img"}, + {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "course-213.img"}, + {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "my-exp.img"}] # Range of student submissions to run (sorted by student emails) # If either is None, all student submissions are run, unless @@ -39,9 +39,9 @@ class Config: # IP of the tango container is usually computed automatically tangoIP = "" - # Redis port. Sometimes we have two redis running, each support a Tango + # Redis port. Sometimes we have two redis running, each support a Tango instance. # In such case a different forwarding port is assigned to it. - # Note: This variable is used by ec2Read.py only. + # Note: This variable is used by tools/ec2Read.py only. redisPort = 6379 # standard # end of class Config diff --git a/tools/run_jobs.py b/tools/run_jobs.py index ce35c39a..97881110 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -6,10 +6,12 @@ from util import Lab import util -# drive student submissions to Tango. See ./util.py for preset configuratons. -# the script finds course and labs at a specified location and submits work -# from the handin directory. -# It then waits for all output files to have newer modification time. +# Drive exiting student submissions to Tango. +# Find course/lab at specified location and submits work from the handin directory. +# Then wait for job output files. +# +# Use -h to show usage. +# See config_for_run_jobs.py for configuration options. 
cfg = Config() cmdLine = CommandLine(cfg) From c58c87b47d98ccda763c6cdc85482cd1fe4598fa Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 9 Apr 2018 16:42:09 -0400 Subject: [PATCH 085/131] ec2Read can delete vms with name tag matching at the beginning or the end. --- tools/ec2Read.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index c1b1f530..bc17db99 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -181,7 +181,8 @@ def allocateVMs(): matchingInstances.append(item) else: for item in sortedInstances: # match a "Name" tag that is not None - if instanceNameTag(item["Instance"]).startswith(partialStr): + if instanceNameTag(item["Instance"]).startswith(partialStr) or \ + instanceNameTag(item["Instance"]).endswith(partialStr): matchingInstances.append(item) # remove the items already terminated From 70c1ed1705a037b6fa6bd3eb67f6e793e2099ee7 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 17 Apr 2018 16:25:56 -0400 Subject: [PATCH 086/131] more comments, isolate timestamp insertion for easy understanding of code. --- autodriver/autodriver.c | 135 ++++++++++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 46 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 75d26e33..71209459 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -39,6 +39,26 @@ #include #include +// How autodriver works: +// +// The parent process creates an output file and starts a child process run_job(). +// The child process assumes under the home directory of the user "autograde" +// there is a directory specified on the command line of this program. +// Under that directory, there is a Makefile. +// The child will run the Makefile to start the tests and redirects all output +// to the output file created by the parent process. 
+// +// After the child process terminates, the parent parses the output file and +// sends the content to stdout, in dump_output() and dump_file(). If the +// output file is too large, it's elided in the middle. If timestamp +// option (-i) is specified, timestamps are inserted into the output stream. +// +// If timestamp option is set: The parent starts a thread timestampFunc() after +// starting the child process. The thread records at the given interval the +// timestamps (output file size AND time). While parsing the output file +// after the child process, the recorded timestamps are inserted at the offsets +// by insertTimestamp(). + #define min(x, y) ((x) < (y) ? (x) : (y)) char timestampStr[100]; @@ -187,7 +207,8 @@ static int parse_user(char *name, struct passwd *user_info, char **buf) { return 0; } -// pthread function, keep a map of timestamp and user's output file offset +// pthread function, keep a map of timestamp and user's output file offset. +// The thread is not started unless timestamp interval option is specified. void *timestampFunc() { time_t lastStamp = 0; int lastJumpIndex = -1; @@ -273,8 +294,60 @@ int writeBuffer(char *buffer, size_t nBytes) { // nBytes can be zero (no-op) return 0; } -#define WRITE_BUFFER(buffer, nBytes) \ - if (writeBuffer(buffer, nBytes)) return -1 +// Insert the timestamp at the appropriate places. +// When failing to write to the output file, return with updated scanCursor, +void insertTimestamp(char *buffer, + size_t bufferOffset, + size_t bufferLength, + char **scanCursorInOut, + unsigned *currentStampInOut) { + char *scanCursor = *scanCursorInOut; + unsigned currentStamp = *currentStampInOut; + size_t nextOffset = bufferOffset + bufferLength; + size_t eolOffset = 0; + + // pace through timestamps that fall into the buffer + while (currentStamp < timestampCount && + timestampMap[currentStamp].offset < nextOffset) { + + // there might be unused timestamps from last read buffer or before last eol. 
+ // skip over them. + if (timestampMap[currentStamp].offset < bufferOffset || + timestampMap[currentStamp].offset <= eolOffset) { + currentStamp++; + continue; + } + + char *eolSearchStart = timestampMap[currentStamp].offset - bufferOffset + buffer; + char *nextEol = strchr(eolSearchStart, '\n'); + if (!nextEol) { // no line break found in read buffer to insert timestamp + break; + } + + + // write the stuff up to the line break + if (writeBuffer(scanCursor, nextEol - scanCursor + 1)) {break;} + scanCursor = nextEol + 1; + + // no timestamp at EOF, because the test scores are on the last line + eolOffset = bufferOffset + (nextEol - buffer); + if (eolOffset + 1 >= outputFileSize) { + break; + } + + // write the timestamp + char stampInsert[200]; + sprintf(stampInsert, + "...[timestamp %s inserted by autodriver at offset ~%lu. Maybe out of sync with output's own timestamps.]...\n", + getTimestamp(timestampMap[currentStamp].time), + timestampMap[currentStamp].offset); + if (writeBuffer(stampInsert, strlen(stampInsert))) {break;} + currentStamp++; + } // while loop through the stamps falling into read buffer's range + + *scanCursorInOut = scanCursor; + *currentStampInOut = currentStamp; +} /** * @brief Dumps a specified number of bytes from a file to standard out @@ -291,8 +364,9 @@ static int dump_file(int fd, size_t bytes, off_t offset) { size_t nextOffset = offset; if (offset) { // second part of output file, after truncating in the middle - char *msg = "\n...[excess bytes elided by autodriver]...\n"; - WRITE_BUFFER(msg, strlen(msg)); + // insert a message, indicating file truncation + char *msg = "\n...[excess bytes elided by autodriver]...\n"; + if (writeBuffer(msg, strlen(msg))) {return -1;} } // Flush stdout so our writes here don't race with buffer flushes @@ -309,7 +383,6 @@ static int dump_file(int fd, size_t bytes, off_t offset) { while (read_rem > 0) { char buffer[BUFSIZE]; ssize_t nread; - size_t bufferOffset = nextOffset; memset(buffer, 0, BUFSIZE); if 
((nread = read(fd, buffer, min(read_rem, BUFSIZE))) < 0) { @@ -317,48 +390,15 @@ static int dump_file(int fd, size_t bytes, off_t offset) { return -1; } read_rem -= nread; - nextOffset += nread; // offset of currently read buffer in the file char *scanCursor = buffer; - size_t eolOffset = 0; - - // pace through timestamps that fall into the buffer - while (currentStamp < timestampCount && - timestampMap[currentStamp].offset < nextOffset) { - - // there might be unused timestamps from last read buffer or before last eol - if (timestampMap[currentStamp].offset < bufferOffset || - timestampMap[currentStamp].offset <= eolOffset) { - currentStamp++; - continue; - } - - char *eolSearchStart = timestampMap[currentStamp].offset - bufferOffset + buffer; - char *nextEol = strchr(eolSearchStart, '\n'); - if (!nextEol) { // no line break found in read buffer to insert timestamp - break; - } - // write the stuff up to the line break - WRITE_BUFFER(scanCursor, nextEol - scanCursor + 1); // write up to \n - scanCursor = nextEol + 1; + if (timestampCount) { // If inserting timestamp + insertTimestamp(buffer, nextOffset, nread, &scanCursor, ¤tStamp); + } - // no timestamp at EOF, because the test scores are on the last line - eolOffset = bufferOffset + (nextEol - buffer); - if (eolOffset + 1 >= outputFileSize) { - break; - } + if (writeBuffer(scanCursor, nread - (scanCursor - buffer))) {return -1;} - // write the timestamp - char stampInsert[200]; - sprintf(stampInsert, - "...[timestamp %s inserted by autodriver at offset ~%lu. 
Maybe out of sync with output's own timestamps.]...\n", - getTimestamp(timestampMap[currentStamp].time), - timestampMap[currentStamp].offset); - WRITE_BUFFER(stampInsert, strlen(stampInsert)); - currentStamp++; - } // while loop through the stamps falling into read buffer's range - - WRITE_BUFFER(scanCursor, nread - (scanCursor - buffer)); + nextOffset += nread; // offset of next read buffer in the file } // while loop finish reading return 0; @@ -586,7 +626,7 @@ static int monitor_child(pid_t child) { int killed = 0; int status; - // create a thread for for file size tracking by time interval + // create a thread to track the file size at given time interval pthread_t timestampThread = 0; // this thread needs no cancellation if (args.timestamp_interval > 0) { if (pthread_create(×tampThread, NULL, timestampFunc, NULL)) { @@ -607,7 +647,7 @@ static int monitor_child(pid_t child) { if (sigtimedwait(&sigset, NULL, &timeout) < 0) { // Child timed out - ERROR_ERRNO("Job timed out after %d seconds\n", args.timeout); + ERROR("Job timed out after %d seconds", args.timeout); assert(errno == EAGAIN); kill(child, SIGKILL); killed = 1; @@ -787,6 +827,9 @@ int main(int argc, char **argv) { sigaddset(&sigset, SIGCHLD); sigprocmask(SIG_BLOCK, &sigset, NULL); + // output file is written by the child process while running the test. + // It's created here before forking, because the timestamp thread needs + // read access to it. if ((child_output_fd = open(OUTPUT_FILE, O_WRONLY | O_CREAT | O_TRUNC | O_SYNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { ERROR_ERRNO("Creating output file"); From d27fb52b4f3c486d5bffc4042498b53f2d9d7e90 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 18 Apr 2018 16:28:59 -0400 Subject: [PATCH 087/131] Add info about the HostPort config variables. 
--- tools/config_for_run_jobs.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index e94d3232..240a6d0f 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -32,16 +32,29 @@ class Config: # YOUR Tango repo root (cloned from xyzisinus' Autolab github) tangoDir = "/h/myname/Tango" - # Sometimes multiple experimental Tango containers are run on one machine. - # They are identified by different ports. - tangoHostPort = "host-port 8600" - # IP of the tango container is usually computed automatically tangoIP = "" - # Redis port. Sometimes we have two redis running, each support a Tango instance. - # In such case a different forwarding port is assigned to it. - # Note: This variable is used by tools/ec2Read.py only. - redisPort = 6379 # standard + # INFO: Where tango and redis ports are defined + # In docker-compose.yml file (under parent dir of Tango), there can be: + ''' + tango: + ports: + - '8600:8600' + - '6380:6379' + ''' + # The first port pair is for tango. The port before ":" is on the host and + # the other (optional) inside the container if tango/redis are run in a + # container. The second line is for redis. + # Sometimes we run multiple tango/redis containers on the same host for + # separate experiments. To access different tango/redis, we can give them + # different on-host port numbers, hence the need for the HostPort variables. + # A util script can reach the desirable entity using those varialbes. + + # Note: This variable is used by tools/util.py (run_jobs.py) only so far. + tangoHostPort = "host-port 8600" + + # Note: This variable is used by tools/ec2Read.py only so far. 
+ redisHostPort = 6379 # default # end of class Config From c2e1a6140de8849d426c911e1735b2b5b86b843c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 24 Apr 2018 00:00:22 -0400 Subject: [PATCH 088/131] Better error report in case of write failure. --- autodriver/autodriver.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 71209459..671d90f3 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -286,6 +286,7 @@ int writeBuffer(char *buffer, size_t nBytes) { // nBytes can be zero (no-op) while (write_rem > 0) { if ((nwritten = write(STDOUT_FILENO, write_base, write_rem)) < 0) { ERROR_ERRNO("Writing output"); + ERROR("Failure details: write_base %p write_rem %lu", write_base, write_rem); return -1; } write_rem -= nwritten; @@ -318,15 +319,17 @@ void insertTimestamp(char *buffer, continue; } - char *eolSearchStart = timestampMap[currentStamp].offset - bufferOffset + buffer; + char *eolSearchStart = buffer + (timestampMap[currentStamp].offset - bufferOffset); char *nextEol = strchr(eolSearchStart, '\n'); if (!nextEol) { // no line break found in read buffer to insert timestamp break; } - // write the stuff up to the line break - if (writeBuffer(scanCursor, nextEol - scanCursor + 1)) {break;} + if (writeBuffer(scanCursor, (nextEol + 1) - scanCursor)) { + ERROR("Write failed: buffer %p cursor %p nextEol %p", buffer, scanCursor, nextEol); + break; + } scanCursor = nextEol + 1; // no timestamp at EOF, because the test scores are on the last line @@ -336,7 +339,7 @@ void insertTimestamp(char *buffer, } // write the timestamp - char stampInsert[200]; + char stampInsert[300]; sprintf(stampInsert, "...[timestamp %s inserted by autodriver at offset ~%lu. 
Maybe out of sync with output's own timestamps.]...\n", getTimestamp(timestampMap[currentStamp].time), @@ -396,7 +399,10 @@ static int dump_file(int fd, size_t bytes, off_t offset) { insertTimestamp(buffer, nextOffset, nread, &scanCursor, ¤tStamp); } - if (writeBuffer(scanCursor, nread - (scanCursor - buffer))) {return -1;} + if (writeBuffer(scanCursor, nread - (scanCursor - buffer))) { + ERROR("Write failed: buffer %p cursor %p nread %lu", buffer, scanCursor, nread); + return -1; + } nextOffset += nread; // offset of next read buffer in the file } // while loop finish reading @@ -670,6 +676,7 @@ static int monitor_child(pid_t child) { MESSAGE("Timestamps inserted at %d-second or larger intervals, depending on output rates", args.timestamp_interval); } + MESSAGE("Also check end of output for potential errors"); childFinished = 1; dump_output(); From b8801609f1027ecf3790c37daa9b04a146cc427c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 26 Apr 2018 13:22:25 -0400 Subject: [PATCH 089/131] Change redis port name ec2Read to match the config variable. --- tools/config_for_run_jobs.py | 19 +++++++++++++++---- tools/ec2Read.py | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index 240a6d0f..ed03329d 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -10,27 +10,37 @@ class Config: # YOUR course name course = "your-name-experiment" + course = "czang-exp" # YOUR root dir for course/lab definitions and handin (student submissions) courseRoot = "/n/scratch/czang/f16/" + #courseRoot = "/n/scratch/czang/f17/" + courseRoot = "/mnt/data/f16/" # YOUR lab definitions. 
The index of the lab is given to run_job.py labs = [ - {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "course-746.img"}, - {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "course-213.img"}, - {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "my-exp.img"}] + {"name": "cloudfscheckpoint2dedup", "handinSuffix": ".tar", "image": "penndot.img"}, + {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "penndot.img"}, + {"name": "myftlcheckpoint2", "handinSuffix": ".cpp", "image": "746.img"}, + {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "746.img"}, + {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "xyz.img"}, + {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "xyz.img"}, + {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "xyz.img"}] # Range of student submissions to run (sorted by student emails) # If either is None, all student submissions are run, unless # -r, -f, or -s is given to run_jobs. firstStudentNum = 3 # start from index 3 (set to None for all students) - totalStudents = 1 # run one student + totalStudents = 1 # number of students to submit + firstStudentNum = None # set to None for all students + # YOUR Tango container's root dir for submissions and output tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" # YOUR Tango repo root (cloned from xyzisinus' Autolab github) tangoDir = "/h/myname/Tango" + tangoDir = "/root/autolab-oneclick/server/Tango" # IP of the tango container is usually computed automatically tangoIP = "" @@ -56,5 +66,6 @@ class Config: # Note: This variable is used by tools/ec2Read.py only so far. redisHostPort = 6379 # default + redisHostPort = 6380 # end of class Config diff --git a/tools/ec2Read.py b/tools/ec2Read.py index bc17db99..c1aecf4f 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -155,7 +155,7 @@ def allocateVMs(): # is defined in config_for_run_jobs.py. 
To select the redis server, # We get the connection here and pass it into tangoObjects redisConnection = redis.StrictRedis( - host=Config.REDIS_HOSTNAME, port=config_for_run_jobs.Config.redisPort, db=0) + host=Config.REDIS_HOSTNAME, port=config_for_run_jobs.Config.redisHostPort, db=0) tangoObjects.getRedisConnection(connection=redisConnection) boto3connection = boto3.client("ec2", Config.EC2_REGION) boto3resource = boto3.resource("ec2", Config.EC2_REGION) From c1aa0ce4139b6e4e7aa119d18323ffde3a650221 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 14 May 2018 13:26:37 -0400 Subject: [PATCH 090/131] Fix a buffer over-read problem in audodriver.c. --- autodriver/autodriver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autodriver/autodriver.c b/autodriver/autodriver.c index 671d90f3..3dea9cb3 100644 --- a/autodriver/autodriver.c +++ b/autodriver/autodriver.c @@ -384,10 +384,10 @@ static int dump_file(int fd, size_t bytes, off_t offset) { } while (read_rem > 0) { - char buffer[BUFSIZE]; + char buffer[BUFSIZE + 1]; // keep the last byte as string terminator ssize_t nread; - memset(buffer, 0, BUFSIZE); + memset(buffer, 0, BUFSIZE + 1); if ((nread = read(fd, buffer, min(read_rem, BUFSIZE))) < 0) { ERROR_ERRNO("Reading from output file"); return -1; From 7580ccf7247aaa6101b4cd8ee9affa59b21503c4 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 14 May 2018 15:38:05 -0400 Subject: [PATCH 091/131] Tougher autodriver unit test -- random length long lines. 
--- autodriver/test/output_gen.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/autodriver/test/output_gen.c b/autodriver/test/output_gen.c index 19e3e98f..abd81943 100644 --- a/autodriver/test/output_gen.c +++ b/autodriver/test/output_gen.c @@ -4,8 +4,10 @@ #include #include #include +#include int main() { + srand((unsigned)time(NULL)); putenv("TZ=America/New_York"); tzset(); @@ -16,12 +18,19 @@ int main() { time_t ltime = time(NULL); struct tm* tmInfo = localtime(<ime); strftime(timeStr, 100, "%Y%m%d-%H:%M:%S", tmInfo); - printf("TIME: \"%s\"\n", timeStr); + printf("TIME: \"%s\" followed by 3 lines of random lenth\n", timeStr); int j; - for (j = 0; j < 10; j++) { - printf("=%1d-0123456789", j); + for (j = 0; j < 3; j++) { + int lineLength = rand() % 2000; // longer than autodriver's buf size + int count = 0; + char line[81]; + memset(line, 0, 81); + while (count < lineLength) { + line[count] = '0' + count % 10; + count++; + } + printf("%s\n", line); } - printf("\n"); } sleep(1); } From 9c9b5e509326effab254f6eb9358a4475f07ef3b Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 12 Jun 2018 17:41:43 -0400 Subject: [PATCH 092/131] Add -l (list instances and pools) and -e (empty pools) to ec2Read. --- tools/ec2Read.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index c1aecf4f..2fa71803 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -22,10 +22,14 @@ def __init__(self): parser = argparse.ArgumentParser(description='List AWS vms and preallocator pools') parser.add_argument('-d', '--instances', metavar='instance', nargs='+', help="destroy vms by name tags or AWS ids (can be partial). 
\"NoNameTag\" (case insensitive) deletes all instances without a \"Name\" tag") + parser.add_argument('-l', '--list', action='store_true', dest='listVMs', help="list vms") + parser.add_argument('-e', '--emptyPools', action='store_true', dest='emptyPools', help="empty redis pools") self.args = parser.parse_args() cmdLine = CommandLine() destroyList = cmdLine.args.instances +listVMs = cmdLine.args.listVMs +emptyPools = cmdLine.args.emptyPools sortedInstances = [] local_tz = pytz.timezone("EST") @@ -205,6 +209,18 @@ def allocateVMs(): listInstances() exit() +if listVMs: + listInstances() + listPools() + exit() + +if emptyPools: + destroyRedisPools() + exit() + +listInstances() +listPools() +createInstances(1) listInstances() listPools() exit() From 4cd4f0a70b5a01108b6bd1b4d9abffa27e3a8243 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 22 Jun 2018 15:39:08 -0400 Subject: [PATCH 093/131] First round of converting all ec2 code to using boto3 --- vmms/ec2SSH.py | 164 ++++++++++++++++++++++++------------------------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index b627f3d7..7693d361 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -16,14 +16,13 @@ import config -import boto -from boto import ec2 import boto3 +from boto3 import ec2 +from botocore.exceptions import ClientError from tangoObjects import TangoMachine ### added to suppress boto XML output -- Jason Boles -logging.getLogger('boto').setLevel(logging.CRITICAL) logging.getLogger('boto3').setLevel(logging.CRITICAL) logging.getLogger('botocore').setLevel(logging.CRITICAL) @@ -105,16 +104,11 @@ def __init__(self, accessKeyId=None, accessKey=None): self.log.info("init Ec2SSH") self.ssh_flags = Ec2SSH._SSH_FLAGS - if accessKeyId: - self.connection = ec2.connect_to_region(config.Config.EC2_REGION, - aws_access_key_id=accessKeyId, aws_secret_access_key=accessKey) - self.useDefaultKeyPair = False - else: - self.connection = 
ec2.connect_to_region(config.Config.EC2_REGION) - self.useDefaultKeyPair = True - self.boto3connection = boto3.client("ec2", config.Config.EC2_REGION) - self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) + self.useDefaultKeyPair = False if accessKeyId else True + self.boto3connection = boto3.client("ec2", config.Config.EC2_REGION, + aws_access_key_id=accessKeyId, aws_secret_access_key=accessKey) + self.connection = self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) # Use boto3 to read images. Find the "Name" tag and use it as key to # build a map from "Name tag" to boto3's image structure. @@ -220,21 +214,15 @@ def deleteKeyPair(self): def createSecurityGroup(self): # Create may-exist security group try: - security_group = self.connection.create_security_group( - config.Config.DEFAULT_SECURITY_GROUP, - "Autolab security group - allowing all traffic") - # All ports, all traffics, all ips - security_group.authorize(from_port=None, - to_port=None, ip_protocol='-1', cidr_ip='0.0.0.0/0') - except boto.exception.EC2ResponseError: + response = self.connection.create_security_group( + GroupName=config.Config.DEFAULT_SECURITY_GROUP, + Description="Autolab security group - allowing all traffic") + security_group_id = response['GroupId'] + self.connection.authorize_security_group_ingress( + GroupId=security_group_id) + except ClientError as e: pass - def getInstanceByReservationId(self, reservationId): - for inst in self.connection.get_all_instances(): - if inst.id == reservationId: - return inst.instances.pop() - return None - # # VMMS API functions # @@ -258,62 +246,65 @@ def initializeVM(self, vm): self.key_pair_name = self.keyPairName(vm.id, vm.name) self.createKeyPair() - reservation = self.connection.run_instances( - ec2instance['ami'], - key_name=self.key_pair_name, - security_groups=[ - config.Config.DEFAULT_SECURITY_GROUP], - instance_type=ec2instance['instance_type']) + reservation = 
self.connection.create_instances(ImageId=ec2instance['ami'], + InstanceType=ec2instance['instance_type'], + KeyName=self.key_pair_name, + SecurityGroups=[ + config.Config.DEFAULT_SECURITY_GROUP], + MaxCount=1, + MinCount=1) # Sleep for a while to prevent random transient errors observed # when the instance is not available yet time.sleep(config.Config.TIMER_POLL_INTERVAL) - newInstance = self.getInstanceByReservationId(reservation.id) + newInstance = reservation[0] if newInstance: # Assign name to EC2 instance - self.connection.create_tags([newInstance.id], {"Name": instanceName}) - self.log.info("new instance created %s" % newInstance) + self.connection.create_tags(Resources=[newInstance.id], + Tags=[{"Key": "Name", "Value": instanceName}]) + self.log.info("new instance %s created with name tag %s" % + (newInstance.id, instanceName)) else: raise ValueError("cannot find new instance for %s" % instanceName) # Wait for instance to reach 'running' state start_time = time.time() while True: - elapsed_secs = time.time() - start_time + # Note: You'd think we should be able to read the state from the + # instance but that turns out not working. 
So we round up all + # running intances and find our instance by instance id - newInstance = self.getInstanceByReservationId(reservation.id) - if not newInstance: - raise ValueError("cannot obtain aws instance for %s" % instanceName) + filters=[{'Name': 'instance-state-name', 'Values': ['running']}] + instances = self.connection.instances.filter(Filters=filters) + instanceRunning = False - if newInstance.state == "pending": - if elapsed_secs > config.Config.INITIALIZEVM_TIMEOUT: - raise ValueError("VM %s: timeout (%d seconds) before reaching 'running' state" % - (instanceName, config.Config.TIMER_POLL_INTERVAL)) + newInstance.load() # reload the state of the instance + for inst in instances.filter(InstanceIds=[newInstance.id]): + self.log.debug("VM %s: is running %s" % (instanceName, newInstance.id)) + instanceRunning = True - self.log.debug("VM %s: Waiting to reach 'running' from 'pending'" % instanceName) - time.sleep(config.Config.TIMER_POLL_INTERVAL) - continue + if instanceRunning: + break - if newInstance.state == "running": - self.log.debug("VM %s: has reached 'running' state in %d seconds" % - (instanceName, elapsed_secs)) - break + if time.time() - start_time > config.Config.INITIALIZEVM_TIMEOUT: + raise ValueError("VM %s: timeout (%d seconds) before reaching 'running' state" % + (instanceName, config.Config.TIMER_POLL_INTERVAL)) - raise ValueError("VM %s: quit waiting when seeing state '%s' after %d seconds" % - (instanceName, newInstance.state, elapsed_secs)) + self.log.debug("VM %s: Waiting to reach 'running' from 'pending'" % instanceName) + time.sleep(config.Config.TIMER_POLL_INTERVAL) # end of while loop self.log.info( "VM %s | State %s | Reservation %s | Public DNS Name %s | Public IP Address %s" % (instanceName, newInstance.state, - reservation.id, + reservation, newInstance.public_dns_name, - newInstance.ip_address)) + newInstance.public_ip_address)) # Save domain and id ssigned by EC2 in vm object - vm.domain_name = newInstance.ip_address + 
vm.domain_name = newInstance.public_ip_address vm.ec2_id = newInstance.id self.log.debug("VM %s: %s" % (instanceName, newInstance)) return vm @@ -321,7 +312,7 @@ def initializeVM(self, vm): except Exception as e: self.log.debug("initializeVM Failed: %s" % e) if newInstance: - self.connection.terminate_instances(instance_ids=[newInstance.id]) + self.connection.instances.filter(InstanceIds=[newInstance.id]).terminate() return None def waitVM(self, vm, max_secs): @@ -376,6 +367,7 @@ def waitVM(self, vm, max_secs): # If the call to ssh returns timeout (-1) or ssh error # (255), then success. Otherwise, keep trying until we run # out of time. + ret = timeout(["ssh"] + self.ssh_flags + ["%s@%s" % (config.Config.EC2_USER_NAME, domain_name), "(:)"], max_secs - elapsed_secs) @@ -486,12 +478,6 @@ def destroyVM(self, vm): """ destroyVM - Removes a VM from the system """ - # test if the instance still exists - reservations = self.connection.get_all_instances(instance_ids=[vm.ec2_id]) - if not reservations: - self.log.info("destroyVM: instance non-exist %s %s" % (vm.ec2_id, vm.name)) - return [] - self.log.info("destroyVM: %s %s %s %s" % (vm.ec2_id, vm.name, vm.keepForDebugging, vm.notes)) # Keep the vm and mark with meaningful tags for debugging @@ -509,7 +495,7 @@ def destroyVM(self, vm): instance.create_tags(Tags=[{"Key": "Notes", "Value": vm.notes}]) return - ret = self.connection.terminate_instances(instance_ids=[vm.ec2_id]) + ret = self.connection.instances.filter(InstanceIds=[vm.ec2_id]).terminate() # delete dynamically created key if not self.useDefaultKeyPair: self.deleteKeyPair() @@ -519,38 +505,52 @@ def destroyVM(self, vm): def safeDestroyVM(self, vm): return self.destroyVM(vm) + # return None or tag value if key exists + def getTag(self, tagList, tagKey): + if tagList: + for tag in tagList: + if tag["Key"] == tagKey: + return tag["Value"] + return None + def getVMs(self): - """ getVMs - Returns the complete list of VMs on this account. 
Each + """ getVMs - Returns the running or pending VMs on this account. Each list entry is a boto.ec2.instance.Instance object. """ - # TODO: Find a way to return vm objects as opposed ec2 instance - # objects. - instances = list() - for i in self.connection.get_all_instances(): - if i.id is not config.Config.TANGO_RESERVATION_ID: - inst = i.instances.pop() - if inst.state_code is config.Config.INSTANCE_RUNNING: - instances.append(inst) vms = list() - for inst in instances: - vm = TangoMachine() - vm.ec2_id = inst.id - vm.name = str(inst.tags.get('Name')) - self.log.debug('getVMs: Instance - %s, EC2 Id - %s' % - (vm.name, vm.ec2_id)) - vms.append(vm) + filters=[{'Name': 'instance-state-name', 'Values': ['running', 'pending']}] + + for inst in self.connection.instances.filter(Filters=filters): + vm = TangoMachine() # make a Tango internal vm structure + vm.ec2_id = inst.id + vm.id = None # the serial number as in inst name PREFIX-serial-IMAGE + vm.domain_name = None + + vm.name = self.getTag(inst.tags, "Name") + # Name tag is the standard form of prefix-serial-image + if vm.name and re.match("%s-" % config.Config.PREFIX, vm.name): + vm.id = int(vm.name.split("-")[1]) + elif not vm.name: + vm.name = "Instance_id_" + inst.id + "_without_name_tag" + + if inst.public_ip_address: + vm.domain_name = inst.public_ip_address + + self.log.debug('getVMs: Instance id %s, name %s' % (vm.name, vm.ec2_id)) + vms.append(vm) return vms def existsVM(self, vm): """ existsVM - Checks whether a VM exists in the vmms. 
""" - instances = self.connection.get_all_instances() - for inst in instances: - if inst.instances[0].id == vm.ec2_id and inst.instances[0].state == "running": - return True + filters=[{'Name': 'instance-state-name', 'Values': ['running']}] + instances = self.connection.instances.filter(Filters=filters) + for inst in instances.filter(InstanceIds=[vm.ec2_id]): + self.log.debug("VM %s: exists and running" % vm.ec2_id) + return True return False def getImages(self): From 559f7aaa8b621aa6367e389d892f246c1e4ce8fe Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 29 Jun 2018 17:30:16 -0400 Subject: [PATCH 094/131] Clean up the tool that experiments on aws instances and tango vms. --- tools/ec2Read.py | 132 +++++++++++++++++++++++++---------------------- 1 file changed, 71 insertions(+), 61 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 2fa71803..6bf90300 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -20,29 +20,46 @@ class CommandLine(): def __init__(self): parser = argparse.ArgumentParser(description='List AWS vms and preallocator pools') - parser.add_argument('-d', '--instances', metavar='instance', nargs='+', - help="destroy vms by name tags or AWS ids (can be partial). \"NoNameTag\" (case insensitive) deletes all instances without a \"Name\" tag") - parser.add_argument('-l', '--list', action='store_true', dest='listVMs', help="list vms") - parser.add_argument('-e', '--emptyPools', action='store_true', dest='emptyPools', help="empty redis pools") + parser.add_argument('-d', '--destroyVMs', action='store_true', dest='destroyVMs', help="destroy VMs and empty pools") + parser.add_argument('-D', '--instanceNameTags', metavar='instance', nargs='+', + help="destroy instances by name tags or AWS ids (can be partial). 
\"None\" (case insensitive) deletes all instances without a \"Name\" tag") + parser.add_argument('-l', '--list', action='store_true', dest='listVMs', help="list and ping live vms") + parser.add_argument('-L', '--listAll', action='store_true', dest='listInstances', help="list all instances") self.args = parser.parse_args() cmdLine = CommandLine() -destroyList = cmdLine.args.instances -listVMs = cmdLine.args.listVMs -emptyPools = cmdLine.args.emptyPools -sortedInstances = [] +argDestroyInstanceNameTags = cmdLine.args.instanceNameTags +argListVMs = cmdLine.args.listVMs +argListAllInstances = cmdLine.args.listInstances +argDestroyVMs = cmdLine.args.destroyVMs local_tz = pytz.timezone("EST") def utc_to_local(utc_dt): local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) return local_tz.normalize(local_dt) -def destroyInstances(): +def destroyVMs(): vms = ec2.getVMs() + print "number of Tango VMs:", len(vms) for vm in vms: - if re.match("%s-" % Config.PREFIX, vm.name): - print "destroy", vm.name - ec2.destroyVM(vm) + print "destroy", vm.name + ec2.destroyVM(vm) + +def pingVMs(): + vms = ec2.getVMs() + print "number of Tango VMs:", len(vms) + for vm in vms: + if vm.id: + print "ping", vm.name, vm.id + # Note: following call needs the private key file for aws to be + # at wherever SECURITY_KEY_PATH in config.py points to. + # For example, if SECURITY_KEY_PATH = '/root/746-autograde.pem', + # then the file should exist there. 
+ ec2.waitVM(vm, Config.WAITVM_TIMEOUT) + else: + print "VM not in Tango naming pattern:", vm.name + +# END of function definitions # local_tz = pytz.timezone("EST") @@ -76,33 +93,34 @@ def instanceNameTag(instance): name = tag["Value"] return name -def queryInstances(): - global sortedInstances +def listInstances(all=None): + sortedInstances = [] nameInstances = [] + instanceType = "all" response = boto3connection.describe_instances() for reservation in response["Reservations"]: for instance in reservation["Instances"]: - if instance["State"]["Name"] != "running": + if not all and instance["State"]["Name"] != "running": + instanceType = "running" continue - nameInstances.append({"Name": instanceNameTag(instance), "Instance": instance}) + nameInstances.append({"Name": instanceNameTag(instance), + "Instance": instance}) sortedInstances = sorted(nameInstances, key=lambda x: x["Name"]) - print len(sortedInstances), "instances:" - -def listInstances(knownInstances=None): - global sortedInstances - instanceList = [] - if knownInstances: - instanceList = knownInstances - else: - queryInstances() - instanceList = sortedInstances - - for item in instanceList: + print "number of", instanceType, "AWS instances:", len(sortedInstances) + + for item in sortedInstances: instance = item["Instance"] launchTime = utc_to_local(instance["LaunchTime"]) - print("%s: %s %s %s" % - (item["Name"], instance["InstanceId"], instance["PublicIpAddress"], launchTime)) + if "PublicIpAddress" in instance: + print("%s: %s %s %s %s" % + (item["Name"], instance["InstanceId"], + launchTime, instance["State"]["Name"], + instance["PublicIpAddress"])) + else: + print("%s: %s %s %s" % + (item["Name"], instance["InstanceId"], + launchTime, instance["State"]["Name"])) if "Tags" in instance: for tag in instance["Tags"]: if (tag["Key"] != "Name"): @@ -122,8 +140,10 @@ def listInstances(knownInstances=None): print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) """ + return sortedInstances + def listPools(): 
- print "pools", ec2.img2ami.keys() + print "Tango VM pools by AWS image", ec2.img2ami.keys() for key in server.preallocator.machines.keys(): pool = server.preallocator.getPool(key) totalPool = pool["total"] @@ -153,7 +173,6 @@ def allocateVMs(): free = server.preallocator.getPool(key)["free"] print "after allocation", key, total, free - # When a host has two Tango containers (for experiment), there are two # redis servers, too. They differ by the forwarding port number, which # is defined in config_for_run_jobs.py. To select the redis server, @@ -168,12 +187,11 @@ def allocateVMs(): ec2 = server.preallocator.vmms["ec2SSH"] pools = ec2.img2ami -if destroyList: - print "Current" - listInstances() +if argDestroyInstanceNameTags: + sortedInstances = listInstances() totalTerminated = [] - for partialStr in destroyList: + for partialStr in argDestroyInstanceNameTags: matchingInstances = [] if partialStr.lower() == "NoNameTag".lower(): # without "Name" tag for item in sortedInstances: @@ -205,44 +223,36 @@ def allocateVMs(): print "no instances matching query string \"%s\"" % partialStr # end of for loop partialStr - print "Aftermath" - listInstances() + print "Afterwards" + print "----------" + listInstances('all') + exit() + +if argListAllInstances: + listInstances("all") exit() -if listVMs: +if argListVMs: listInstances() listPools() + pingVMs() exit() -if emptyPools: +if argDestroyVMs: + destroyVMs() destroyRedisPools() + print "Afterwards" + print "----------" + listInstances() + listPools() exit() -listInstances() -listPools() -createInstances(1) -listInstances() -listPools() -exit() +# Start of main actions -destroyInstances() -destroyRedisPools() listInstances() listPools() -exit() - -createInstances(2) -listInstances() -listPools() -exit() - -allocateVMs() # should see some vms disappear from free pool +createInstances(1) listInstances() listPools() exit() -# resetTango will destroy all known vms that are NOT in free pool 
-server.resetTango(server.preallocator.vmms) -listInstances() -listPools() - From 9c1acfa98405c48925c91e7daad6b6a63b2a923c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 2 Jul 2018 15:48:37 -0400 Subject: [PATCH 095/131] Cleanup the ec2 tool script. --- tools/ec2Read.py | 66 ++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 6bf90300..04a98d36 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -11,7 +11,6 @@ import redis import boto3 import pytz -import tzlocal import argparse # Read aws instances, Tango preallocator pools, etc. @@ -20,6 +19,7 @@ class CommandLine(): def __init__(self): parser = argparse.ArgumentParser(description='List AWS vms and preallocator pools') + parser.add_argument('-c', '--createVMs', action='store_true', dest='createVMs', help="create a VM for each pool") parser.add_argument('-d', '--destroyVMs', action='store_true', dest='destroyVMs', help="destroy VMs and empty pools") parser.add_argument('-D', '--instanceNameTags', metavar='instance', nargs='+', help="destroy instances by name tags or AWS ids (can be partial). 
\"None\" (case insensitive) deletes all instances without a \"Name\" tag") @@ -28,15 +28,11 @@ def __init__(self): self.args = parser.parse_args() cmdLine = CommandLine() -argDestroyInstanceNameTags = cmdLine.args.instanceNameTags +argDestroyInstanceByNameTags = cmdLine.args.instanceNameTags argListVMs = cmdLine.args.listVMs argListAllInstances = cmdLine.args.listInstances argDestroyVMs = cmdLine.args.destroyVMs - -local_tz = pytz.timezone("EST") -def utc_to_local(utc_dt): - local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) - return local_tz.normalize(local_dt) +argCreateVMs = cmdLine.args.createVMs def destroyVMs(): vms = ec2.getVMs() @@ -59,31 +55,37 @@ def pingVMs(): else: print "VM not in Tango naming pattern:", vm.name -# END of function definitions # - local_tz = pytz.timezone("EST") - def utc_to_local(utc_dt): local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) return local_dt.strftime("%Y%m%d-%H:%M:%S") # to test destroying instances without "Name" tag -def deleteNameTag(): - response = boto3connection.describe_instances() - for reservation in response["Reservations"]: - for instance in reservation["Instances"]: - boto3connection.delete_tags(Resources=[instance["InstanceId"]], - Tags=[{"Key": "Name"}]) +def deleteNameTagForAllInstances(): + instances = listInstances() + for instance in instances: + boto3connection.delete_tags(Resources=[instance["Instance"]["InstanceId"]], + Tags=[{"Key": "Name"}]) + print "Afterwards" + print "----------" + listInstances() # to test changing tags to keep the vm after test failure -def changeTags(instanceId, name, notes): - print "change tags for", instanceId - instance = boto3resource.Instance(instanceId) - tag = boto3resource.Tag(instanceId, "Name", name) - if tag: - tag.delete() - instance.create_tags(Tags=[{"Key": "Name", "Value": "failed-" + name}]) - instance.create_tags(Tags=[{"Key": "Notes", "Value": notes}]) +def changeTagForAllInstances(): + instances = listInstances() + for inst in 
instances: + instance = inst["Instance"] + name = inst["Name"] + notes = "tag " + name + " deleted" + boto3connection.delete_tags(Resources=[instance["InstanceId"]], + Tags=[{"Key": "Name"}]) + boto3connection.create_tags(Resources=[instance["InstanceId"]], + Tags=[{"Key": "Name", "Value": "failed-" + name}, + {"Key": "Notes", "Value": notes}]) + + print "Afterwards" + print "----------" + listInstances() def instanceNameTag(instance): name = "None" @@ -173,6 +175,8 @@ def allocateVMs(): free = server.preallocator.getPool(key)["free"] print "after allocation", key, total, free +# END of function definitions # + # When a host has two Tango containers (for experiment), there are two # redis servers, too. They differ by the forwarding port number, which # is defined in config_for_run_jobs.py. To select the redis server, @@ -187,11 +191,11 @@ def allocateVMs(): ec2 = server.preallocator.vmms["ec2SSH"] pools = ec2.img2ami -if argDestroyInstanceNameTags: +if argDestroyInstanceByNameTags: sortedInstances = listInstances() totalTerminated = [] - for partialStr in argDestroyInstanceNameTags: + for partialStr in argDestroyInstanceByNameTags: matchingInstances = [] if partialStr.lower() == "NoNameTag".lower(): # without "Name" tag for item in sortedInstances: @@ -247,7 +251,15 @@ def allocateVMs(): listPools() exit() -# Start of main actions +if argCreateVMs: + listInstances() + listPools() + createInstances(1) + listInstances() + listPools() + exit() + +# For combination of ops not provided by the command line options: listInstances() listPools() From 13f7363a38d5a544f9c6827c3462b55446053112 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 5 Jul 2018 11:35:26 -0400 Subject: [PATCH 096/131] ec2 tool script uses boto3 API consistent with Tango code. 
--- tools/ec2Read.py | 170 ++++++++++++++++++++++------------------------- 1 file changed, 79 insertions(+), 91 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 04a98d36..1eaede2b 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -18,13 +18,19 @@ class CommandLine(): def __init__(self): - parser = argparse.ArgumentParser(description='List AWS vms and preallocator pools') - parser.add_argument('-c', '--createVMs', action='store_true', dest='createVMs', help="create a VM for each pool") - parser.add_argument('-d', '--destroyVMs', action='store_true', dest='destroyVMs', help="destroy VMs and empty pools") - parser.add_argument('-D', '--instanceNameTags', metavar='instance', nargs='+', + parser = argparse.ArgumentParser( + description='List AWS vms and preallocator pools') + parser.add_argument('-c', '--createVMs', action='store_true', + dest='createVMs', help="create a VM for each pool") + parser.add_argument('-d', '--destroyVMs', action='store_true', + dest='destroyVMs', help="destroy VMs and empty pools") + parser.add_argument('-D', '--instanceNameTags', metavar='instance', + nargs='+', help="destroy instances by name tags or AWS ids (can be partial). 
\"None\" (case insensitive) deletes all instances without a \"Name\" tag") - parser.add_argument('-l', '--list', action='store_true', dest='listVMs', help="list and ping live vms") - parser.add_argument('-L', '--listAll', action='store_true', dest='listInstances', help="list all instances") + parser.add_argument('-l', '--list', action='store_true', + dest='listVMs', help="list and ping live vms") + parser.add_argument('-L', '--listAll', action='store_true', + dest='listInstances', help="list all instances") self.args = parser.parse_args() cmdLine = CommandLine() @@ -64,7 +70,7 @@ def utc_to_local(utc_dt): def deleteNameTagForAllInstances(): instances = listInstances() for instance in instances: - boto3connection.delete_tags(Resources=[instance["Instance"]["InstanceId"]], + boto3connection.delete_tags(Resources=[instance["Instance"].id], Tags=[{"Key": "Name"}]) print "Afterwards" print "----------" @@ -87,62 +93,52 @@ def changeTagForAllInstances(): print "----------" listInstances() -def instanceNameTag(instance): - name = "None" - if "Tags" in instance: - for tag in instance["Tags"]: - if tag["Key"] == "Name": - name = tag["Value"] - return name - def listInstances(all=None): - sortedInstances = [] - nameInstances = [] + nameAndInstances = [] + + filters=[] instanceType = "all" - response = boto3connection.describe_instances() - for reservation in response["Reservations"]: - for instance in reservation["Instances"]: - if not all and instance["State"]["Name"] != "running": - instanceType = "running" - continue - nameInstances.append({"Name": instanceNameTag(instance), - "Instance": instance}) - - sortedInstances = sorted(nameInstances, key=lambda x: x["Name"]) - print "number of", instanceType, "AWS instances:", len(sortedInstances) - - for item in sortedInstances: + if not all: + filters=[{'Name': 'instance-state-name', 'Values': ['running']}] + instanceType = "running" + + instances = boto3resource.instances.filter(Filters=filters) + for instance in 
boto3resource.instances.filter(Filters=filters): + nameAndInstances.append({"Name": ec2.getTag(instance.tags, "Name"), + "Instance": instance}) + + nameAndInstances.sort(key=lambda x: x["Name"]) + print "number of", instanceType, "AWS instances:", len(nameAndInstances) + + for item in nameAndInstances: instance = item["Instance"] - launchTime = utc_to_local(instance["LaunchTime"]) - if "PublicIpAddress" in instance: + launchTime = utc_to_local(instance.launch_time) + if instance.public_ip_address: print("%s: %s %s %s %s" % - (item["Name"], instance["InstanceId"], - launchTime, instance["State"]["Name"], - instance["PublicIpAddress"])) + (item["Name"], instance.id, + launchTime, instance.state["Name"], + instance.public_ip_address)) else: print("%s: %s %s %s" % - (item["Name"], instance["InstanceId"], - launchTime, instance["State"]["Name"])) - if "Tags" in instance: - for tag in instance["Tags"]: + (item["Name"], instance.id, + launchTime, instance.state["Name"])) + + if instance.tags: + for tag in instance.tags: if (tag["Key"] != "Name"): print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) else: print("\t No tags") - """ useful sometimes - print "ImageId:", instance["ImageId"] - print "PublicDnsName:", instance["PublicDnsName"] - print "InstanceType:", instance["InstanceType"] - print "State:", instance["State"]["Name"] - print "SecurityGroups:", instance["SecurityGroups"] - image = boto3resource.Image(instance["ImageId"]) - print "Image:", image.image_id + """ useful sometimes + print "\t InstanceType:", instance.instance_type + image = boto3resource.Image(instance.image_id) + print "\t ImageId:", image.image_id for tag in image.tags: - print("\t tag {%s: %s}" % (tag["Key"], tag["Value"])) - """ + print("\t\t image tag {%s: %s}" % (tag["Key"], tag["Value"])) + """ - return sortedInstances + return nameAndInstances def listPools(): print "Tango VM pools by AWS image", ec2.img2ami.keys() @@ -154,7 +150,8 @@ def listPools(): freePool.sort() print "pool", key, 
"total", len(totalPool), totalPool, freePool -def createInstances(num): +# allocate "num" vms for each and every pool (image) +def createVMs(num): for imageName in pools: (poolName, ext) = os.path.splitext(imageName) print "creating", num, "for pool", poolName @@ -167,14 +164,6 @@ def destroyRedisPools(): server.preallocator.machines.set(key, [[], TangoQueue(key)]) server.preallocator.machines.get(key)[1].make_empty() -def allocateVMs(): - freeList = [] - for key in server.preallocator.machines.keys(): - server.preallocator.allocVM(key) - total = server.preallocator.getPool(key)["total"] - free = server.preallocator.getPool(key)["free"] - print "after allocation", key, total, free - # END of function definitions # # When a host has two Tango containers (for experiment), there are two @@ -192,44 +181,43 @@ def allocateVMs(): pools = ec2.img2ami if argDestroyInstanceByNameTags: - sortedInstances = listInstances() + nameAndInstances = listInstances() totalTerminated = [] + matchingInstances = [] for partialStr in argDestroyInstanceByNameTags: - matchingInstances = [] - if partialStr.lower() == "NoNameTag".lower(): # without "Name" tag - for item in sortedInstances: - if "None" == instanceNameTag(item["Instance"]): - matchingInstances.append(item) - elif partialStr.startswith("i-"): # match instance id - for item in sortedInstances: - if item["Instance"]["InstanceId"].startswith(partialStr): + if partialStr.startswith("i-"): # match instance id + for item in nameAndInstances: + if item["Instance"].id.startswith(partialStr): matchingInstances.append(item) else: - for item in sortedInstances: # match a "Name" tag that is not None - if instanceNameTag(item["Instance"]).startswith(partialStr) or \ - instanceNameTag(item["Instance"]).endswith(partialStr): + # part of "Name" tag or None to match instances without name tag + for item in nameAndInstances: + nameTag = ec2.getTag(item["Instance"].tags, "Name") + if nameTag and \ + (nameTag.startswith(partialStr) or 
nameTag.endswith(partialStr)): + matchingInstances.append(item) + elif not nameTag and partialStr == "None": matchingInstances.append(item) - # remove the items already terminated - instancesToTerminate = [] - for item in matchingInstances: - if not any(x["Instance"]["InstanceId"] == item["Instance"]["InstanceId"] for x in totalTerminated): - instancesToTerminate.append(item) - totalTerminated.append(item) - - if instancesToTerminate: - print "terminate %d instances matching query string \"%s\"" % (len(instancesToTerminate), partialStr) - listInstances(instancesToTerminate) - for item in instancesToTerminate: - boto3connection.terminate_instances(InstanceIds=[item["Instance"]["InstanceId"]]) - else: - print "no instances matching query string \"%s\"" % partialStr - # end of for loop partialStr + # the loop above may generate duplicates in matchingInstances + terminatedInstances = [] + for item in matchingInstances: + if item["Instance"].id not in terminatedInstances: + boto3connection.terminate_instances(InstanceIds=[item["Instance"].id]) + terminatedInstances.append(item["Instance"].id) + + if terminatedInstances: + print "terminate %d instances matching query string \"%s\":" % \ + (len(terminatedInstances), argDestroyInstanceByNameTags) + for id in terminatedInstances: + print id + print "Afterwards" + print "----------" + listInstances() + else: + print "no instances matching query string \"%s\"" % argDestroyInstanceByNameTags - print "Afterwards" - print "----------" - listInstances('all') exit() if argListAllInstances: @@ -254,7 +242,7 @@ def allocateVMs(): if argCreateVMs: listInstances() listPools() - createInstances(1) + createVMs(1) listInstances() listPools() exit() @@ -263,7 +251,7 @@ def allocateVMs(): listInstances() listPools() -createInstances(1) +createVMs(1) listInstances() listPools() exit() From 788ee1dad6522cef9acce1c7341190f7e3cf3cf1 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 9 Jul 2018 16:17:02 -0400 Subject: [PATCH 
097/131] consolidate boto3 connection naming, make aws access id/key work again and add tests. --- config.template.py | 3 +- tools/ec2Read.py | 44 +++++++++++++++++------- vmms/ec2SSH.py | 85 +++++++++++++++++++++------------------------- 3 files changed, 72 insertions(+), 60 deletions(-) diff --git a/config.template.py b/config.template.py index 6075c54f..2ed3b115 100644 --- a/config.template.py +++ b/config.template.py @@ -165,10 +165,11 @@ class Config: EC2_REGION = '' EC2_USER_NAME = '' + KEEP_VM_AFTER_FAILURE = False DEFAULT_INST_TYPE = '' DEFAULT_SECURITY_GROUP = '' SECURITY_KEY_PATH = '' - DYNAMIC_SECURITY_KEY_PATH = '' + DYNAMIC_SECURITY_KEY_PATH = '' # key file placed at root "/" by default SECURITY_KEY_NAME = '' TANGO_RESERVATION_ID = '' INSTANCE_RUNNING = 16 # Status code of a instance that is running diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 1eaede2b..a4474d8c 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -20,12 +20,13 @@ class CommandLine(): def __init__(self): parser = argparse.ArgumentParser( description='List AWS vms and preallocator pools') + parser.add_argument('-a', '--accessIdKeyUser', + help="aws access id, key and user, space separated") parser.add_argument('-c', '--createVMs', action='store_true', dest='createVMs', help="create a VM for each pool") parser.add_argument('-d', '--destroyVMs', action='store_true', dest='destroyVMs', help="destroy VMs and empty pools") - parser.add_argument('-D', '--instanceNameTags', metavar='instance', - nargs='+', + parser.add_argument('-D', '--instanceNameTags', nargs='+', help="destroy instances by name tags or AWS ids (can be partial). 
\"None\" (case insensitive) deletes all instances without a \"Name\" tag") parser.add_argument('-l', '--list', action='store_true', dest='listVMs', help="list and ping live vms") @@ -39,20 +40,24 @@ def __init__(self): argListAllInstances = cmdLine.args.listInstances argDestroyVMs = cmdLine.args.destroyVMs argCreateVMs = cmdLine.args.createVMs +argAccessIdKeyUser = cmdLine.args.accessIdKeyUser def destroyVMs(): vms = ec2.getVMs() print "number of Tango VMs:", len(vms) for vm in vms: - print "destroy", vm.name - ec2.destroyVM(vm) - + if vm.id: + print "destroy", ec2.instanceName(vm.id, vm.name) + ec2.destroyVM(vm) + else: + print "VM not in Tango naming pattern:", vm.name + def pingVMs(): vms = ec2.getVMs() print "number of Tango VMs:", len(vms) for vm in vms: if vm.id: - print "ping", vm.name, vm.id + print "ping", ec2.instanceName(vm.id, vm.name) # Note: following call needs the private key file for aws to be # at wherever SECURITY_KEY_PATH in config.py points to. # For example, if SECURITY_KEY_PATH = '/root/746-autograde.pem', @@ -247,12 +252,25 @@ def destroyRedisPools(): listPools() exit() -# For combination of ops not provided by the command line options: +# ec2WithKey can be used to test the case that tango_cli uses +# non-default aws access id and key +if argAccessIdKeyUser: + if len(argAccessIdKeyUser.split()) != 3: + print "access id, key and user must be quoted and space separated" + exit() + (id, key, user) = argAccessIdKeyUser.split() + ec2WithKey = Ec2SSH(accessKeyId=id, accessKey=key, ec2User=user) + vm = TangoMachine(vmms="ec2SSH") + vm.id = int(2000) # a high enough number to avoid collision + # to test non-default access id/key, the aws image must have the key manually + # installed or allows the key to be installed by the aws service. 
+ # the following assumes we have such image with a "Name" tag "test01.img" + vm.name = "test01" + ec2WithKey.initializeVM(vm) + ec2WithKey.waitVM(vm, Config.WAITVM_TIMEOUT) + listInstances() + +# Write combination of ops not provided by the command line options here: + -listInstances() -listPools() -createVMs(1) -listInstances() -listPools() -exit() diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 7693d361..12314879 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -1,13 +1,8 @@ # # ec2SSH.py - Implements the Tango VMMS interface to run Tango jobs on Amazon EC2. # -# This implementation uses the AWS EC2 SDK to manage the virtual machines and -# ssh and scp to access them. The following excecption are raised back -# to the caller: -# -# Ec2Exception - EC2 raises this if it encounters any problem -# ec2CallError - raised by ec2Call() function -# +# ssh and scp to access them. + import subprocess import os import re @@ -15,13 +10,11 @@ import logging import config +from tangoObjects import TangoMachine import boto3 -from boto3 import ec2 from botocore.exceptions import ClientError -from tangoObjects import TangoMachine - ### added to suppress boto XML output -- Jason Boles logging.getLogger('boto3').setLevel(logging.CRITICAL) logging.getLogger('botocore').setLevel(logging.CRITICAL) @@ -54,7 +47,6 @@ def timeout(command, time_out=1): returncode = p.poll() return returncode - def timeoutWithReturnStatus(command, time_out, returnValue=0): """ timeoutWithReturnStatus - Run a Unix command with a timeout, until the expected value is returned by the command; On timeout, @@ -76,22 +68,12 @@ def timeoutWithReturnStatus(command, time_out, returnValue=0): stderr=subprocess.STDOUT) return ret -# -# User defined exceptions -# -# ec2Call() exception - - -class ec2CallError(Exception): - pass - - class Ec2SSH: _SSH_FLAGS = ["-i", config.Config.SECURITY_KEY_PATH, "-o", "StrictHostKeyChecking no", "-o", "GSSAPIAuthentication no"] - def __init__(self, accessKeyId=None, 
accessKey=None): + def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): """ log - logger for the instance connection - EC2Connection object that stores the connection info to the EC2 network @@ -107,16 +89,17 @@ def __init__(self, accessKeyId=None, accessKey=None): self.useDefaultKeyPair = False if accessKeyId else True self.boto3connection = boto3.client("ec2", config.Config.EC2_REGION, - aws_access_key_id=accessKeyId, aws_secret_access_key=accessKey) - self.connection = self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) + aws_access_key_id=accessKeyId, + aws_secret_access_key=accessKey) + self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) + self.ec2User = config.Config.EC2_USER_NAME if not ec2User else ec2User # Use boto3 to read images. Find the "Name" tag and use it as key to # build a map from "Name tag" to boto3's image structure. # The code is currently using boto 2 for most of the work and we don't # have the energy to upgrade it yet. So boto and boto3 are used together. 
- client = boto3.client("ec2", config.Config.EC2_REGION) - images = client.describe_images(Owners=["self"])["Images"] + images = self.boto3connection.describe_images(Owners=["self"])["Images"] self.img2ami = {} for image in images: if "Tags" not in image: @@ -196,15 +179,21 @@ def tangoMachineToEC2Instance(self, vm): def createKeyPair(self): # try to delete the key to avoid collision self.key_pair_path = "%s/%s.pem" % \ - (config.Config.DYNAMIC_SECURITY_KEY_PATH, self.key_pair_name) + (config.Config.DYNAMIC_SECURITY_KEY_PATH, + self.key_pair_name) self.deleteKeyPair() - key = self.connection.create_key_pair(self.key_pair_name) - key.save(config.Config.DYNAMIC_SECURITY_KEY_PATH) + response = self.boto3connection.create_key_pair(KeyName=self.key_pair_name) + keyFile = open(self.key_pair_path, "w+") + keyFile.write(response["KeyMaterial"]) + os.chmod(self.key_pair_path, 0o600) + keyFile.close() + # change the SSH_FLAG accordingly self.ssh_flags[1] = self.key_pair_path + return self.key_pair_path def deleteKeyPair(self): - self.connection.delete_key_pair(self.key_pair_name) + self.boto3connection.delete_key_pair(KeyName=self.key_pair_name) # try to delete may not exist key file try: os.remove(self.key_pair_path) @@ -214,11 +203,11 @@ def deleteKeyPair(self): def createSecurityGroup(self): # Create may-exist security group try: - response = self.connection.create_security_group( + response = self.boto3resource.create_security_group( GroupName=config.Config.DEFAULT_SECURITY_GROUP, Description="Autolab security group - allowing all traffic") security_group_id = response['GroupId'] - self.connection.authorize_security_group_ingress( + self.boto3resource.authorize_security_group_ingress( GroupId=security_group_id) except ClientError as e: pass @@ -244,9 +233,9 @@ def initializeVM(self, vm): self.key_pair_path = config.Config.SECURITY_KEY_PATH else: self.key_pair_name = self.keyPairName(vm.id, vm.name) - self.createKeyPair() + self.key_pair_path = self.createKeyPair() - 
reservation = self.connection.create_instances(ImageId=ec2instance['ami'], + reservation = self.boto3resource.create_instances(ImageId=ec2instance['ami'], InstanceType=ec2instance['instance_type'], KeyName=self.key_pair_name, SecurityGroups=[ @@ -261,7 +250,7 @@ def initializeVM(self, vm): newInstance = reservation[0] if newInstance: # Assign name to EC2 instance - self.connection.create_tags(Resources=[newInstance.id], + self.boto3resource.create_tags(Resources=[newInstance.id], Tags=[{"Key": "Name", "Value": instanceName}]) self.log.info("new instance %s created with name tag %s" % (newInstance.id, instanceName)) @@ -276,7 +265,7 @@ def initializeVM(self, vm): # running intances and find our instance by instance id filters=[{'Name': 'instance-state-name', 'Values': ['running']}] - instances = self.connection.instances.filter(Filters=filters) + instances = self.boto3resource.instances.filter(Filters=filters) instanceRunning = False newInstance.load() # reload the state of the instance @@ -312,7 +301,7 @@ def initializeVM(self, vm): except Exception as e: self.log.debug("initializeVM Failed: %s" % e) if newInstance: - self.connection.instances.filter(InstanceIds=[newInstance.id]).terminate() + self.boto3resource.instances.filter(InstanceIds=[newInstance.id]).terminate() return None def waitVM(self, vm, max_secs): @@ -369,7 +358,7 @@ def waitVM(self, vm, max_secs): # out of time. 
ret = timeout(["ssh"] + self.ssh_flags + - ["%s@%s" % (config.Config.EC2_USER_NAME, domain_name), + ["%s@%s" % (self.ec2User, domain_name), "(:)"], max_secs - elapsed_secs) self.log.debug("VM %s: ssh returned with %d" % @@ -495,7 +484,7 @@ def destroyVM(self, vm): instance.create_tags(Tags=[{"Key": "Notes", "Value": vm.notes}]) return - ret = self.connection.instances.filter(InstanceIds=[vm.ec2_id]).terminate() + ret = self.boto3resource.instances.filter(InstanceIds=[vm.ec2_id]).terminate() # delete dynamically created key if not self.useDefaultKeyPair: self.deleteKeyPair() @@ -521,23 +510,27 @@ def getVMs(self): vms = list() filters=[{'Name': 'instance-state-name', 'Values': ['running', 'pending']}] - for inst in self.connection.instances.filter(Filters=filters): + for inst in self.boto3resource.instances.filter(Filters=filters): vm = TangoMachine() # make a Tango internal vm structure vm.ec2_id = inst.id vm.id = None # the serial number as in inst name PREFIX-serial-IMAGE vm.domain_name = None - vm.name = self.getTag(inst.tags, "Name") + instName = self.getTag(inst.tags, "Name") # Name tag is the standard form of prefix-serial-image - if vm.name and re.match("%s-" % config.Config.PREFIX, vm.name): - vm.id = int(vm.name.split("-")[1]) - elif not vm.name: + if instName and re.match("%s-" % config.Config.PREFIX, instName): + vm.id = int(instName.split("-")[1]) + vm.name = instName.split("-")[2] + elif not instName: vm.name = "Instance_id_" + inst.id + "_without_name_tag" + else: + vm.name = instName if inst.public_ip_address: vm.domain_name = inst.public_ip_address - self.log.debug('getVMs: Instance id %s, name %s' % (vm.name, vm.ec2_id)) + self.log.debug('getVMs: Instance id %s, pool %s, vm id %s' % \ + (vm.ec2_id, vm.name, vm.id)) vms.append(vm) return vms @@ -547,7 +540,7 @@ def existsVM(self, vm): """ filters=[{'Name': 'instance-state-name', 'Values': ['running']}] - instances = self.connection.instances.filter(Filters=filters) + instances = 
self.boto3resource.instances.filter(Filters=filters) for inst in instances.filter(InstanceIds=[vm.ec2_id]): self.log.debug("VM %s: exists and running" % vm.ec2_id) return True From 2b0ea16b3a00f0768243c248d374f1ced2ea6861 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 10 Jul 2018 17:23:28 -0400 Subject: [PATCH 098/131] Use boto3 resource API to process images for consistent use of boto3 api. --- vmms/ec2SSH.py | 80 +++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 12314879..bbcc0cf9 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -72,6 +72,7 @@ class Ec2SSH: _SSH_FLAGS = ["-i", config.Config.SECURITY_KEY_PATH, "-o", "StrictHostKeyChecking no", "-o", "GSSAPIAuthentication no"] + _SECURITY_KEY_PATH_INDEX_IN_SSH_FLAGS = 1 def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): """ log - logger for the instance @@ -82,48 +83,49 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): """ self.log = logging.getLogger("Ec2SSH-" + str(os.getpid())) - self.log.info("init Ec2SSH") self.ssh_flags = Ec2SSH._SSH_FLAGS - + self.ec2User = ec2User if ec2User else config.Config.EC2_USER_NAME self.useDefaultKeyPair = False if accessKeyId else True - self.boto3connection = boto3.client("ec2", config.Config.EC2_REGION, - aws_access_key_id=accessKeyId, - aws_secret_access_key=accessKey) + + self.boto3client = boto3.client("ec2", config.Config.EC2_REGION, + aws_access_key_id=accessKeyId, + aws_secret_access_key=accessKey) self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) - self.ec2User = config.Config.EC2_USER_NAME if not ec2User else ec2User - # Use boto3 to read images. Find the "Name" tag and use it as key to - # build a map from "Name tag" to boto3's image structure. - # The code is currently using boto 2 for most of the work and we don't - # have the energy to upgrade it yet. So boto and boto3 are used together. 
+ # Note: By convention, all usable images to Tango must have "Name" tag + # in the form of xyz.img which is the VM image in Autolab for an assignment. + # xyz is also the preallocator pool name for vms using this image. - images = self.boto3connection.describe_images(Owners=["self"])["Images"] self.img2ami = {} + images = self.boto3resource.images.filter(Owners=["self"]) for image in images: - if "Tags" not in image: - continue - tags = image["Tags"] - for tag in tags: - if "Key" in tag and tag["Key"] == "Name": - if not (tag["Value"] and tag["Value"].endswith(".img")): - self.log.info("Ignore %s for ill-formed name tag %s" % - (image["ImageId"], tag["Value"])) - continue - if tag["Value"] in self.img2ami: - self.log.info("Ignore %s for duplicate name tag %s" % - (image["ImageId"], tag["Value"])) - continue - + if image.tags: + for tag in image.tags: + if tag["Key"] == "Name": + if tag["Value"] and tag["Value"].endswith(".img"): + if tag["Value"] in self.img2ami: + self.log.info("Ignore %s for duplicate name tag %s" % + (image.id, tag["Value"])) + else: self.img2ami[tag["Value"]] = image - self.log.info("Found image: %s %s %s" % (tag["Value"], image["ImageId"], image["Name"])) + self.log.info("Found image: %s with name tag %s" % + (image.id, tag["Value"])) + elif tag["Value"]: + self.log.info("Ignore %s with ill-formed name tag %s" % + (image.id, tag["Value"])) + + imageAMIs = [item.id for item in images] + taggedAMIs = [self.img2ami[key].id for key in self.img2ami] + ignoredAMIs = list(set(imageAMIs) - set(taggedAMIs)) + if (len(ignoredAMIs) > 0): + self.log.info("Ignored images %s for lack of or ill-formed name tag" % + str(ignoredAMIs)) - imageAmis = [item["ImageId"] for item in images] - taggedAmis = [self.img2ami[key]["ImageId"] for key in self.img2ami] - ignoredAmis = list(set(imageAmis) - set(taggedAmis)) - if (len(ignoredAmis) > 0): - self.log.info("Ignored amis %s due to lack of proper name tag" % str(ignoredAmis)) + # + # VMMS helper methods + # def 
instanceName(self, id, name): """ instanceName - Constructs a VM instance name. Always use @@ -142,9 +144,6 @@ def domainName(self, vm): instance. """ return vm.domain_name - # - # VMMS helper methods - # def tangoMachineToEC2Instance(self, vm): """ tangoMachineToEC2Instance - returns an object with EC2 instance @@ -171,7 +170,7 @@ def tangoMachineToEC2Instance(self, vm): else: ec2instance['instance_type'] = config.Config.DEFAULT_INST_TYPE - ec2instance['ami'] = self.img2ami[vm.name + ".img"]["ImageId"] + ec2instance['ami'] = self.img2ami[vm.name + ".img"].id self.log.info("tangoMachineToEC2Instance: %s" % str(ec2instance)) return ec2instance @@ -182,18 +181,18 @@ def createKeyPair(self): (config.Config.DYNAMIC_SECURITY_KEY_PATH, self.key_pair_name) self.deleteKeyPair() - response = self.boto3connection.create_key_pair(KeyName=self.key_pair_name) + response = self.boto3client.create_key_pair(KeyName=self.key_pair_name) keyFile = open(self.key_pair_path, "w+") keyFile.write(response["KeyMaterial"]) - os.chmod(self.key_pair_path, 0o600) + os.chmod(self.key_pair_path, 0o600) # read only by owner keyFile.close() # change the SSH_FLAG accordingly - self.ssh_flags[1] = self.key_pair_path + self.ssh_flags[Ec2SSH._SECURITY_KEY_PATH_INDEX_IN_SSH_FLAGS] = self.key_pair_path return self.key_pair_path def deleteKeyPair(self): - self.boto3connection.delete_key_pair(KeyName=self.key_pair_name) + self.boto3client.delete_key_pair(KeyName=self.key_pair_name) # try to delete may not exist key file try: os.remove(self.key_pair_path) @@ -215,6 +214,7 @@ def createSecurityGroup(self): # # VMMS API functions # + def initializeVM(self, vm): """ initializeVM - Tell EC2 to create a new VM instance. 
return None on failure @@ -529,7 +529,7 @@ def getVMs(self): if inst.public_ip_address: vm.domain_name = inst.public_ip_address - self.log.debug('getVMs: Instance id %s, pool %s, vm id %s' % \ + self.log.debug('getVMs: Instance id %s, pool %s, vm id %s' % (vm.ec2_id, vm.name, vm.id)) vms.append(vm) From ce3f9a47136f175bf1b0b5c6bfcef1c74f0d9d21 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 13 Jul 2018 15:57:17 -0400 Subject: [PATCH 099/131] In detachVM, remove the replace_vm feature which allows a "bad" vm (os error returned) to be replaced by creating a new one immediately. However, it interferes with the "low water mark" feature that aims to keep free vms below the water mark. On the other hand, a destroyed vm will be replaced anyway, when a new job comes in and needs a vm, given the total vms are below the upper limit. In such case, the job has to wait a bit longer for the new vm's readiness, a small price to pay. --- worker.py | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/worker.py b/worker.py index ba0bc9d2..213c5229 100644 --- a/worker.py +++ b/worker.py @@ -40,12 +40,10 @@ def __init__(self, job, vmms, jobQueue, preallocator, preVM): # # Worker helper functions # - def detachVM(self, return_vm=False, replace_vm=False): + def detachVM(self, return_vm=False): """ detachVM - Detach the VM from this worker. The options are to return it to the pool's free list (return_vm), destroy it - (not return_vm), and if destroying it, whether to replace it - or not in the pool (replace_vm). The worker must always call - this function before returning. + (if not return_vm). 
""" # job-owned instance, simply destroy after job is completed if self.job.accessKeyId: @@ -55,13 +53,6 @@ def detachVM(self, return_vm=False, replace_vm=False): self.preallocator.freeVM(self.job.vm) else: self.vmms.safeDestroyVM(self.job.vm) - if replace_vm: - self.preallocator.createVM(self.job.vm) - - # Important: don't remove the VM from the pool until its - # replacement has been created. Otherwise there is a - # potential race where the job manager thinks that the - # pool is empty and creates a spurious vm. self.log.info("removeVM %s" % self.job.vm.id); self.preallocator.removeVM(self.job.vm) @@ -77,7 +68,7 @@ def rescheduleJob(self, hdrfile, ret, err): os.remove(hdrfile) except OSError: pass - self.detachVM(return_vm=False, replace_vm=True) + self.detachVM(return_vm=False) self.jobQueue.unassignJob(self.job.id) # Here is where we give up @@ -97,7 +88,7 @@ def rescheduleJob(self, hdrfile, ret, err): ret["copyout"])) self.catFiles(hdrfile, self.job.outputFile) - self.detachVM(return_vm=False, replace_vm=True) + self.detachVM(return_vm=False) self.notifyServer(self.job) def appendMsg(self, filename, msg): @@ -144,8 +135,7 @@ def notifyServer(self, job): except Exception as e: self.log.debug("Error in notifyServer: %s" % str(e)) - def afterJobExecution(self, hdrfile, msg, vmHandling): - (returnVM, replaceVM) = vmHandling + def afterJobExecution(self, hdrfile, msg, returnVM): self.jobQueue.makeDead(self.job.id, msg) # Update the text that users see in the autodriver output file @@ -153,7 +143,7 @@ def afterJobExecution(self, hdrfile, msg, vmHandling): self.catFiles(hdrfile, self.job.outputFile) # Thread exit after termination - self.detachVM(return_vm=returnVM, replace_vm=replaceVM) + self.detachVM(return_vm=returnVM) self.notifyServer(self.job) return @@ -206,7 +196,7 @@ def run(self): self.jobLogAndTrace("assigned VM (just initialized)", self.job.vm) vm = self.job.vm - (returnVM, replaceVM) = (True, False) + returnVM = True # Wait for the instance to be ready 
self.jobLogAndTrace("waiting for VM", vm) @@ -235,7 +225,7 @@ def run(self): if ret["copyin"] != 0: Config.copyin_errors += 1 msg = "Error: Copy in to VM failed (status=%d)" % (ret["copyin"]) - self.afterJobExecution(hdrfile, msg, (returnVM, replaceVM)) + self.afterJobExecution(hdrfile, msg, returnVM) return # Run the job on the virtual machine @@ -265,9 +255,9 @@ def run(self): # and do not retry the job since the job may have damaged # the VM. msg = "Error: OS error while running job on VM" - (returnVM, replaceVM) = (False, True) - # doNotDestroy, combined with KEEP_VM_AFTER_FAILURE, will sent - # the vm aside for further investigation after failure. + returnVM = False + # doNotDestroy, combined with KEEP_VM_AFTER_FAILURE, will + # set the vm aside for further investigation after failure. self.job.vm.keepForDebugging = True self.job.vm.notes = str(self.job.id) + "_" + self.job.name else: # This should never happen @@ -279,7 +269,7 @@ def run(self): else: msg = "Success: Autodriver returned normally" - self.afterJobExecution(hdrfile, msg, (returnVM, replaceVM)) + self.afterJobExecution(hdrfile, msg, returnVM) return # @@ -296,4 +286,4 @@ def run(self): if self.preVM and not vm: vm = self.job.vm = self.preVM if vm: - self.detachVM(return_vm=False, replace_vm=True) + self.detachVM(return_vm=False) From 721d6c2bea6c4c42852150390583bbf2e6ca0a7c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 16 Jul 2018 16:13:10 -0400 Subject: [PATCH 100/131] destroyVM doesn't return a value for ohter vmms modules. remove from tashi. 
--- vmms/tashiSSH.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vmms/tashiSSH.py b/vmms/tashiSSH.py index ea05e114..af9c567a 100644 --- a/vmms/tashiSSH.py +++ b/vmms/tashiSSH.py @@ -337,7 +337,8 @@ def destroyVM(self, vm): """ destroyVM - Removes a VM from the system """ ret = self.tashiCall("destroyVm", [vm.instance_id]) - return ret + self.log.debug("Destroying VM %s status %s" % (vm.instance_id, ret)) + return def safeDestroyVM(self, vm): """ safeDestroyVM - More robust version of destroyVM. From 1ffa4aea587ed2536b9410e209a9b0083d8b93f5 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 16 Jul 2018 16:14:15 -0400 Subject: [PATCH 101/131] Rework exception catch for vmms/ec2 module. --- vmms/ec2SSH.py | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index bbcc0cf9..57019bec 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -89,17 +89,24 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): self.ec2User = ec2User if ec2User else config.Config.EC2_USER_NAME self.useDefaultKeyPair = False if accessKeyId else True - self.boto3client = boto3.client("ec2", config.Config.EC2_REGION, - aws_access_key_id=accessKeyId, - aws_secret_access_key=accessKey) - self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) + self.img2ami = {} + images = [] + + try: + self.boto3client = boto3.client("ec2", config.Config.EC2_REGION, + aws_access_key_id=accessKeyId, + aws_secret_access_key=accessKey) + self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) + + images = self.boto3resource.images.filter(Owners=["self"]) + except Exception as e: + self.log.error("Ec2SSH init Failed: %s"% e) + raise # serious error # Note: By convention, all usable images to Tango must have "Name" tag # in the form of xyz.img which is the VM image in Autolab for an assignment. 
# xyz is also the preallocator pool name for vms using this image. - self.img2ami = {} - images = self.boto3resource.images.filter(Owners=["self"]) for image in images: if image.tags: for tag in image.tags: @@ -209,6 +216,7 @@ def createSecurityGroup(self): self.boto3resource.authorize_security_group_ingress( GroupId=security_group_id) except ClientError as e: + # security group may have been created already pass # @@ -299,7 +307,7 @@ def initializeVM(self, vm): return vm except Exception as e: - self.log.debug("initializeVM Failed: %s" % e) + self.log.error("initializeVM Failed: %s" % e) if newInstance: self.boto3resource.instances.filter(InstanceIds=[newInstance.id]).terminate() return None @@ -464,11 +472,13 @@ def copyOut(self, vm, destFile): config.Config.COPYOUT_TIMEOUT) def destroyVM(self, vm): - """ destroyVM - Removes a VM from the system - """ + """ destroyVM - Removes a VM from the system + """ - self.log.info("destroyVM: %s %s %s %s" % (vm.ec2_id, vm.name, vm.keepForDebugging, vm.notes)) + self.log.info("destroyVM: %s %s %s %s" % + (vm.ec2_id, vm.name, vm.keepForDebugging, vm.notes)) + try: # Keep the vm and mark with meaningful tags for debugging if hasattr(config.Config, 'KEEP_VM_AFTER_FAILURE') and \ config.Config.KEEP_VM_AFTER_FAILURE and vm.keepForDebugging: @@ -484,12 +494,14 @@ def destroyVM(self, vm): instance.create_tags(Tags=[{"Key": "Notes", "Value": vm.notes}]) return - ret = self.boto3resource.instances.filter(InstanceIds=[vm.ec2_id]).terminate() + self.boto3resource.instances.filter(InstanceIds=[vm.ec2_id]).terminate() # delete dynamically created key if not self.useDefaultKeyPair: self.deleteKeyPair() - return ret + except Exception as e: + self.log.error("destroyVM init Failed: %s for vm %s" % (e, vm.ec2_id)) + pass def safeDestroyVM(self, vm): return self.destroyVM(vm) @@ -503,10 +515,11 @@ def getTag(self, tagList, tagKey): return None def getVMs(self): - """ getVMs - Returns the running or pending VMs on this account. 
Each - list entry is a boto.ec2.instance.Instance object. - """ + """ getVMs - Returns the running or pending VMs on this account. Each + list entry is a boto.ec2.instance.Instance object. + """ + try: vms = list() filters=[{'Name': 'instance-state-name', 'Values': ['running', 'pending']}] @@ -534,9 +547,11 @@ def getVMs(self): vms.append(vm) return vms + except Exception as e: + self.log.debug("getVMs Failed: %s" % e) def existsVM(self, vm): - """ existsVM - Checks whether a VM exists in the vmms. + """ existsVM - Checks whether a VM exists in the vmms. Internal use. """ filters=[{'Name': 'instance-state-name', 'Values': ['running']}] From 2ba0f33fcec535472c97c3ae52f2a34db5415dc5 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 17 Jul 2018 16:33:17 -0400 Subject: [PATCH 102/131] reindent to use 4 spaces. --- vmms/ec2SSH.py | 240 ++++++++++++++++++++++++------------------------- 1 file changed, 120 insertions(+), 120 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 57019bec..8ee68913 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -93,42 +93,42 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): images = [] try: - self.boto3client = boto3.client("ec2", config.Config.EC2_REGION, - aws_access_key_id=accessKeyId, - aws_secret_access_key=accessKey) - self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) + self.boto3client = boto3.client("ec2", config.Config.EC2_REGION, + aws_access_key_id=accessKeyId, + aws_secret_access_key=accessKey) + self.boto3resource = boto3.resource("ec2", config.Config.EC2_REGION) - images = self.boto3resource.images.filter(Owners=["self"]) + images = self.boto3resource.images.filter(Owners=["self"]) except Exception as e: - self.log.error("Ec2SSH init Failed: %s"% e) - raise # serious error + self.log.error("Ec2SSH init Failed: %s"% e) + raise # serious error # Note: By convention, all usable images to Tango must have "Name" tag # in the form of xyz.img which is the VM image 
in Autolab for an assignment. # xyz is also the preallocator pool name for vms using this image. for image in images: - if image.tags: - for tag in image.tags: - if tag["Key"] == "Name": - if tag["Value"] and tag["Value"].endswith(".img"): - if tag["Value"] in self.img2ami: - self.log.info("Ignore %s for duplicate name tag %s" % - (image.id, tag["Value"])) - else: - self.img2ami[tag["Value"]] = image - self.log.info("Found image: %s with name tag %s" % - (image.id, tag["Value"])) - elif tag["Value"]: - self.log.info("Ignore %s with ill-formed name tag %s" % - (image.id, tag["Value"])) + if image.tags: + for tag in image.tags: + if tag["Key"] == "Name": + if tag["Value"] and tag["Value"].endswith(".img"): + if tag["Value"] in self.img2ami: + self.log.info("Ignore %s for duplicate name tag %s" % + (image.id, tag["Value"])) + else: + self.img2ami[tag["Value"]] = image + self.log.info("Found image: %s with name tag %s" % + (image.id, tag["Value"])) + elif tag["Value"]: + self.log.info("Ignore %s with ill-formed name tag %s" % + (image.id, tag["Value"])) imageAMIs = [item.id for item in images] taggedAMIs = [self.img2ami[key].id for key in self.img2ami] ignoredAMIs = list(set(imageAMIs) - set(taggedAMIs)) if (len(ignoredAMIs) > 0): - self.log.info("Ignored images %s for lack of or ill-formed name tag" % - str(ignoredAMIs)) + self.log.info("Ignored images %s for lack of or ill-formed name tag" % + str(ignoredAMIs)) # # VMMS helper methods @@ -268,28 +268,28 @@ def initializeVM(self, vm): # Wait for instance to reach 'running' state start_time = time.time() while True: - # Note: You'd think we should be able to read the state from the - # instance but that turns out not working. 
So we round up all - # running intances and find our instance by instance id - - filters=[{'Name': 'instance-state-name', 'Values': ['running']}] - instances = self.boto3resource.instances.filter(Filters=filters) - instanceRunning = False - - newInstance.load() # reload the state of the instance - for inst in instances.filter(InstanceIds=[newInstance.id]): - self.log.debug("VM %s: is running %s" % (instanceName, newInstance.id)) - instanceRunning = True - - if instanceRunning: - break - - if time.time() - start_time > config.Config.INITIALIZEVM_TIMEOUT: - raise ValueError("VM %s: timeout (%d seconds) before reaching 'running' state" % - (instanceName, config.Config.TIMER_POLL_INTERVAL)) - - self.log.debug("VM %s: Waiting to reach 'running' from 'pending'" % instanceName) - time.sleep(config.Config.TIMER_POLL_INTERVAL) + # Note: You'd think we should be able to read the state from the + # instance but that turns out not working. So we round up all + # running intances and find our instance by instance id + + filters=[{'Name': 'instance-state-name', 'Values': ['running']}] + instances = self.boto3resource.instances.filter(Filters=filters) + instanceRunning = False + + newInstance.load() # reload the state of the instance + for inst in instances.filter(InstanceIds=[newInstance.id]): + self.log.debug("VM %s: is running %s" % (instanceName, newInstance.id)) + instanceRunning = True + + if instanceRunning: + break + + if time.time() - start_time > config.Config.INITIALIZEVM_TIMEOUT: + raise ValueError("VM %s: timeout (%d seconds) before reaching 'running' state" % + (instanceName, config.Config.TIMER_POLL_INTERVAL)) + + self.log.debug("VM %s: Waiting to reach 'running' from 'pending'" % instanceName) + time.sleep(config.Config.TIMER_POLL_INTERVAL) # end of while loop self.log.info( @@ -309,7 +309,7 @@ def initializeVM(self, vm): except Exception as e: self.log.error("initializeVM Failed: %s" % e) if newInstance: - 
self.boto3resource.instances.filter(InstanceIds=[newInstance.id]).terminate() + self.boto3resource.instances.filter(InstanceIds=[newInstance.id]).terminate() return None def waitVM(self, vm, max_secs): @@ -417,10 +417,10 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): maxOutputFileSize) if hasattr(config.Config, 'AUTODRIVER_LOGGING_TIME_ZONE') and \ config.Config.AUTODRIVER_LOGGING_TIME_ZONE: - runcmd = runcmd + ("-z %s " % config.Config.AUTODRIVER_LOGGING_TIME_ZONE) + runcmd = runcmd + ("-z %s " % config.Config.AUTODRIVER_LOGGING_TIME_ZONE) if hasattr(config.Config, 'AUTODRIVER_TIMESTAMP_INTERVAL') and \ config.Config.AUTODRIVER_TIMESTAMP_INTERVAL: - runcmd = runcmd + ("-i %d " % config.Config.AUTODRIVER_TIMESTAMP_INTERVAL) + runcmd = runcmd + ("-i %d " % config.Config.AUTODRIVER_TIMESTAMP_INTERVAL) runcmd = runcmd + "autolab &> output" # runTimeout * 2 is a conservative estimate. @@ -472,83 +472,83 @@ def copyOut(self, vm, destFile): config.Config.COPYOUT_TIMEOUT) def destroyVM(self, vm): - """ destroyVM - Removes a VM from the system - """ - - self.log.info("destroyVM: %s %s %s %s" % - (vm.ec2_id, vm.name, vm.keepForDebugging, vm.notes)) - - try: - # Keep the vm and mark with meaningful tags for debugging - if hasattr(config.Config, 'KEEP_VM_AFTER_FAILURE') and \ - config.Config.KEEP_VM_AFTER_FAILURE and vm.keepForDebugging: - iName = self.instanceName(vm.id, vm.name) - self.log.info("Will keep VM %s for further debugging" % iName) - instance = self.boto3resource.Instance(vm.ec2_id) - # delete original name tag and replace it with "failed-xyz" - # add notes tag for test name - tag = self.boto3resource.Tag(vm.ec2_id, "Name", iName) - if tag: - tag.delete() - instance.create_tags(Tags=[{"Key": "Name", "Value": "failed-" + iName}]) - instance.create_tags(Tags=[{"Key": "Notes", "Value": vm.notes}]) - return - - self.boto3resource.instances.filter(InstanceIds=[vm.ec2_id]).terminate() - # delete dynamically created key - if not self.useDefaultKeyPair: - 
self.deleteKeyPair() - - except Exception as e: - self.log.error("destroyVM init Failed: %s for vm %s" % (e, vm.ec2_id)) - pass + """ destroyVM - Removes a VM from the system + """ + + self.log.info("destroyVM: %s %s %s %s" % + (vm.ec2_id, vm.name, vm.keepForDebugging, vm.notes)) + + try: + # Keep the vm and mark with meaningful tags for debugging + if hasattr(config.Config, 'KEEP_VM_AFTER_FAILURE') and \ + config.Config.KEEP_VM_AFTER_FAILURE and vm.keepForDebugging: + iName = self.instanceName(vm.id, vm.name) + self.log.info("Will keep VM %s for further debugging" % iName) + instance = self.boto3resource.Instance(vm.ec2_id) + # delete original name tag and replace it with "failed-xyz" + # add notes tag for test name + tag = self.boto3resource.Tag(vm.ec2_id, "Name", iName) + if tag: + tag.delete() + instance.create_tags(Tags=[{"Key": "Name", "Value": "failed-" + iName}]) + instance.create_tags(Tags=[{"Key": "Notes", "Value": vm.notes}]) + return + + self.boto3resource.instances.filter(InstanceIds=[vm.ec2_id]).terminate() + # delete dynamically created key + if not self.useDefaultKeyPair: + self.deleteKeyPair() + + except Exception as e: + self.log.error("destroyVM init Failed: %s for vm %s" % (e, vm.ec2_id)) + pass def safeDestroyVM(self, vm): return self.destroyVM(vm) # return None or tag value if key exists def getTag(self, tagList, tagKey): - if tagList: - for tag in tagList: - if tag["Key"] == tagKey: - return tag["Value"] - return None + if tagList: + for tag in tagList: + if tag["Key"] == tagKey: + return tag["Value"] + return None def getVMs(self): - """ getVMs - Returns the running or pending VMs on this account. Each - list entry is a boto.ec2.instance.Instance object. 
- """ - - try: - vms = list() - filters=[{'Name': 'instance-state-name', 'Values': ['running', 'pending']}] - - for inst in self.boto3resource.instances.filter(Filters=filters): - vm = TangoMachine() # make a Tango internal vm structure - vm.ec2_id = inst.id - vm.id = None # the serial number as in inst name PREFIX-serial-IMAGE - vm.domain_name = None - - instName = self.getTag(inst.tags, "Name") - # Name tag is the standard form of prefix-serial-image - if instName and re.match("%s-" % config.Config.PREFIX, instName): - vm.id = int(instName.split("-")[1]) - vm.name = instName.split("-")[2] - elif not instName: - vm.name = "Instance_id_" + inst.id + "_without_name_tag" - else: - vm.name = instName - - if inst.public_ip_address: - vm.domain_name = inst.public_ip_address - - self.log.debug('getVMs: Instance id %s, pool %s, vm id %s' % - (vm.ec2_id, vm.name, vm.id)) - vms.append(vm) - - return vms - except Exception as e: - self.log.debug("getVMs Failed: %s" % e) + """ getVMs - Returns the running or pending VMs on this account. Each + list entry is a boto.ec2.instance.Instance object. 
+ """ + + try: + vms = list() + filters=[{'Name': 'instance-state-name', 'Values': ['running', 'pending']}] + + for inst in self.boto3resource.instances.filter(Filters=filters): + vm = TangoMachine() # make a Tango internal vm structure + vm.ec2_id = inst.id + vm.id = None # the serial number as in inst name PREFIX-serial-IMAGE + vm.domain_name = None + + instName = self.getTag(inst.tags, "Name") + # Name tag is the standard form of prefix-serial-image + if instName and re.match("%s-" % config.Config.PREFIX, instName): + vm.id = int(instName.split("-")[1]) + vm.name = instName.split("-")[2] + elif not instName: + vm.name = "Instance_id_" + inst.id + "_without_name_tag" + else: + vm.name = instName + + if inst.public_ip_address: + vm.domain_name = inst.public_ip_address + + self.log.debug('getVMs: Instance id %s, pool %s, vm id %s' % + (vm.ec2_id, vm.name, vm.id)) + vms.append(vm) + + return vms + except Exception as e: + self.log.debug("getVMs Failed: %s" % e) def existsVM(self, vm): """ existsVM - Checks whether a VM exists in the vmms. Internal use. @@ -557,12 +557,12 @@ def existsVM(self, vm): filters=[{'Name': 'instance-state-name', 'Values': ['running']}] instances = self.boto3resource.instances.filter(Filters=filters) for inst in instances.filter(InstanceIds=[vm.ec2_id]): - self.log.debug("VM %s: exists and running" % vm.ec2_id) - return True + self.log.debug("VM %s: exists and running" % vm.ec2_id) + return True return False def getImages(self): - """ getImages - return a constant; actually use the ami specified in config + """ getImages - return a constant; actually use the ami specified in config """ self.log.info("getImages: %s" % str(list(self.img2ami.keys()))) return list(self.img2ami.keys()) From 9028de04412398707859a3332eae6259c424f6d9 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 31 Jul 2018 18:54:37 -0400 Subject: [PATCH 103/131] Tell autolab web server Tango's timezone and offset from UTC. 
--- tango.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tango.py b/tango.py index f0394e01..1f754f0f 100755 --- a/tango.py +++ b/tango.py @@ -203,6 +203,9 @@ def getInfo(self): stats['runjob_errors'] = Config.runjob_errors stats['copyout_errors'] = Config.copyout_errors stats['num_threads'] = threading.activeCount() + stats['timezone_offset'] = time.altzone + (zone, daylight) = time.tzname + stats['timezone_name'] = zone + ("" if not daylight else ("/" + daylight)) return stats From a068e1545199acc24f97759b926b0cea8f0eb4d7 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 6 Aug 2018 16:18:14 -0400 Subject: [PATCH 104/131] Add OVERRIDE_INST_TYPE to force the aws instance type when necessary. --- config.template.py | 1 + vmms/ec2SSH.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/config.template.py b/config.template.py index 2ed3b115..8a6734d9 100644 --- a/config.template.py +++ b/config.template.py @@ -166,6 +166,7 @@ class Config: EC2_REGION = '' EC2_USER_NAME = '' KEEP_VM_AFTER_FAILURE = False + OVERRIDE_INST_TYPE = '' # force instance type, if defined DEFAULT_INST_TYPE = '' DEFAULT_SECURITY_GROUP = '' SECURITY_KEY_PATH = '' diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 8ee68913..fe13837f 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -162,7 +162,10 @@ def tangoMachineToEC2Instance(self, vm): memory = vm.memory # in Kbytes cores = vm.cores - if (cores == 1 and memory <= 613 * 1024): + if hasattr(config.Config, 'OVERRIDE_INST_TYPE') and \ + config.Config.OVERRIDE_INST_TYPE: + ec2instance['instance_type'] = config.Config.OVERRIDE_INST_TYPE + elif (cores == 1 and memory <= 613 * 1024): ec2instance['instance_type'] = 't2.micro' elif (cores == 1 and memory <= 1.7 * 1024 * 1024): ec2instance['instance_type'] = 'm1.small' From da0450388072a2a35d45ff68fa5d7d3fc4c1791e Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 13 Aug 2018 12:27:21 -0400 Subject: [PATCH 105/131] Add ability to create aws 
instance with specified instance type. --- config.template.py | 1 - restful-tango/tangoREST.py | 4 +-- tango.py | 5 ---- tangoObjects.py | 37 +++++++++++++++++------- tools/ec2Read.py | 58 ++++++++++++++++++++++++++------------ vmms/ec2SSH.py | 17 +++++------ 6 files changed, 76 insertions(+), 46 deletions(-) diff --git a/config.template.py b/config.template.py index 8a6734d9..2ed3b115 100644 --- a/config.template.py +++ b/config.template.py @@ -166,7 +166,6 @@ class Config: EC2_REGION = '' EC2_USER_NAME = '' KEEP_VM_AFTER_FAILURE = False - OVERRIDE_INST_TYPE = '' # force instance type, if defined DEFAULT_INST_TYPE = '' DEFAULT_SECURITY_GROUP = '' SECURITY_KEY_PATH = '' diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index 602ee885..386dcdcf 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -120,7 +120,6 @@ def createTangoMachine(self, image, vmms=Config.VMMS_NAME, """ createTangoMachine - Creates a tango machine object from image """ return TangoMachine( - name=image, vmms=vmms, image="%s" % (image), cores=vmObj["cores"], @@ -193,6 +192,7 @@ def convertTangoMachineObj(self, tangoMachine): vm['disk'] = tangoMachine.disk vm['id'] = tangoMachine.id vm['name'] = tangoMachine.name + vm['instance_type'] = tangoMachine.instance_type return vm def convertInputFileObj(self, inputFile): @@ -428,7 +428,7 @@ def prealloc(self, key, image, num, vmStr): self.log.error("Invalid prealloc size") return self.status.invalid_prealloc_size if ret == -3: - self.log.error("Invalid image name") + self.log.error("Invalid image name: %s" % image) return self.status.invalid_image self.log.info("Successfully preallocated VMs") return self.status.preallocated diff --git a/tango.py b/tango.py index 1f754f0f..31638d1f 100755 --- a/tango.py +++ b/tango.py @@ -139,8 +139,6 @@ def preallocVM(self, vm, num): if vm.image not in vmms.getImages(): self.log.error("Invalid image name") return -3 - (name, ext) = os.path.splitext(vm.image) - vm.name = name 
self.preallocator.update(vm, num) return 0 except Exception as err: @@ -321,9 +319,6 @@ def __validateJob(self, job, vmms): job.appendTrace("validateJob: Image not found: %s" % job.vm.image) errors += 1 - else: - (name, ext) = os.path.splitext(job.vm.image) - job.vm.name = name if not job.vm.vmms: self.log.error("validateJob: Missing job.vm.vmms") diff --git a/tangoObjects.py b/tangoObjects.py index 8f41dfa4..29ef04d9 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -2,6 +2,7 @@ # # Implements objects used to pass state within Tango. # +import os import redis import pickle import Queue @@ -47,26 +48,42 @@ class TangoMachine(): TangoMachine - A description of the Autograding Virtual Machine """ - def __init__(self, name="DefaultTestVM", image=None, vmms=None, + def __init__(self, image=None, vmms=None, network=None, cores=None, memory=None, disk=None, - domain_name=None, ec2_id=None, resume=None, id=None, - instance_id=None): - self.name = name + domain_name=None): self.image = image + self.vmms = vmms self.network = network self.cores = cores self.memory = memory self.disk = disk - self.vmms = vmms self.domain_name = domain_name - self.ec2_id = ec2_id - self.resume = resume - self.id = id - self.instance_id = id + + self.ec2_id = None + self.resume = None + self.id = None + self.instance_id = None + self.instance_type = None + self.notes = None + # The following attributes can instruct vmms to set the test machine # aside for further investigation. self.keepForDebugging = False - self.notes = None + + # The "name" property is vmms dependent. It doesn't mean the name of + # vm. It's derived from the image name and used as the vm pool name + # The actual vm name is constructed by the instanceName method in each vmms. + self.name = None + + # The image may contain instance type if vmms is ec2. Example: + # course101+t2.small. 
+ if image: + imageParts = image.split('+') + if len(imageParts) == 2: + self.image = imageParts[0] + self.instance_type = imageParts[1] + (name, ext) = os.path.splitext(self.image) + self.name = name + ("+" + self.instance_type if self.instance_type else "") def __repr__(self): return "TangoMachine(image: %s, vmms: %s, id: %s)" % (self.image, self.vmms, self.id) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index a4474d8c..9a531f0c 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -23,7 +23,7 @@ def __init__(self): parser.add_argument('-a', '--accessIdKeyUser', help="aws access id, key and user, space separated") parser.add_argument('-c', '--createVMs', action='store_true', - dest='createVMs', help="create a VM for each pool") + dest='createVMs', help="add a VM for each pool") parser.add_argument('-d', '--destroyVMs', action='store_true', dest='destroyVMs', help="destroy VMs and empty pools") parser.add_argument('-D', '--instanceNameTags', nargs='+', @@ -47,24 +47,24 @@ def destroyVMs(): print "number of Tango VMs:", len(vms) for vm in vms: if vm.id: - print "destroy", ec2.instanceName(vm.id, vm.name) + print "destroy", nameToPrint(ec2.instanceName(vm.id, vm.name)) ec2.destroyVM(vm) else: - print "VM not in Tango naming pattern:", vm.name + print "VM not in Tango naming pattern:", nameToPrint(vm.name) def pingVMs(): vms = ec2.getVMs() print "number of Tango VMs:", len(vms) for vm in vms: if vm.id: - print "ping", ec2.instanceName(vm.id, vm.name) + print "ping", nameToPrint(ec2.instanceName(vm.id, vm.name)) # Note: following call needs the private key file for aws to be # at wherever SECURITY_KEY_PATH in config.py points to. # For example, if SECURITY_KEY_PATH = '/root/746-autograde.pem', # then the file should exist there. 
ec2.waitVM(vm, Config.WAITVM_TIMEOUT) else: - print "VM not in Tango naming pattern:", vm.name + print "VM not in Tango naming pattern:", nameToPrint(vm.name) local_tz = pytz.timezone("EST") def utc_to_local(utc_dt): @@ -120,12 +120,12 @@ def listInstances(all=None): launchTime = utc_to_local(instance.launch_time) if instance.public_ip_address: print("%s: %s %s %s %s" % - (item["Name"], instance.id, + (nameToPrint(item["Name"]), instance.id, launchTime, instance.state["Name"], instance.public_ip_address)) else: print("%s: %s %s %s" % - (item["Name"], instance.id, + (nameToPrint(item["Name"]), instance.id, launchTime, instance.state["Name"])) if instance.tags: @@ -135,8 +135,8 @@ def listInstances(all=None): else: print("\t No tags") - """ useful sometimes print "\t InstanceType:", instance.instance_type + """ useful sometimes image = boto3resource.Image(instance.image_id) print "\t ImageId:", image.image_id for tag in image.tags: @@ -146,22 +146,44 @@ def listInstances(all=None): return nameAndInstances def listPools(): - print "Tango VM pools by AWS image", ec2.img2ami.keys() - for key in server.preallocator.machines.keys(): + print "known AWS images:", ec2.img2ami.keys() + knownPools = server.preallocator.machines.keys() + print "Tango VM pools:", "" if knownPools else "None" + + for key in knownPools: pool = server.preallocator.getPool(key) totalPool = pool["total"] freePool = pool["free"] totalPool.sort() freePool.sort() - print "pool", key, "total", len(totalPool), totalPool, freePool + print "pool", nameToPrint(key), "total", len(totalPool), totalPool, freePool + +def nameToPrint(name): + return "[" + name + "]" if name else "[None]" # allocate "num" vms for each and every pool (image) -def createVMs(num): - for imageName in pools: - (poolName, ext) = os.path.splitext(imageName) - print "creating", num, "for pool", poolName - vm = TangoMachine(vmms="ec2SSH", image=imageName) - server.preallocVM(vm, num) +def addVMs(): + # Add a vm for each image and a vm for 
the first image plus instance type + instanceTypeTried = False + for key in ec2.img2ami.keys(): + vm = TangoMachine(vmms="ec2SSH", image=key) + pool = server.preallocator.getPool(vm.name) + currentCount = len(pool["total"]) if pool else 0 + print "adding a vm into pool", nameToPrint(vm.name) + print "pool", nameToPrint(vm.name), "current size", currentCount + server.preallocVM(vm, currentCount + 1) + + if instanceTypeTried: + continue + else: + instanceTypeTried = True + + vm = TangoMachine(vmms="ec2SSH", image=key+"+t2.small") + pool = server.preallocator.getPool(vm.name) + currentCount = len(pool["total"]) if pool else 0 + print "pool", nameToPrint(vm.name), "current size", currentCount + print "adding a vm into pool", nameToPrint(vm.name) + server.preallocVM(vm, currentCount + 1) def destroyRedisPools(): for key in server.preallocator.machines.keys(): @@ -247,7 +269,7 @@ def destroyRedisPools(): if argCreateVMs: listInstances() listPools() - createVMs(1) + addVMs() # add 1 vm for each image and each image plus instance type listInstances() listPools() exit() diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index fe13837f..5f9afe49 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -104,14 +104,15 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): raise # serious error # Note: By convention, all usable images to Tango must have "Name" tag - # in the form of xyz.img which is the VM image in Autolab for an assignment. - # xyz is also the preallocator pool name for vms using this image. + # whose value is the image name, such as xyz or xyz.img (older form). + # xyz is also the preallocator pool name for vms using this image, if + # instance type is not specified. 
for image in images: if image.tags: for tag in image.tags: if tag["Key"] == "Name": - if tag["Value"] and tag["Value"].endswith(".img"): + if tag["Value"]: if tag["Value"] in self.img2ami: self.log.info("Ignore %s for duplicate name tag %s" % (image.id, tag["Value"])) @@ -119,9 +120,6 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): self.img2ami[tag["Value"]] = image self.log.info("Found image: %s with name tag %s" % (image.id, tag["Value"])) - elif tag["Value"]: - self.log.info("Ignore %s with ill-formed name tag %s" % - (image.id, tag["Value"])) imageAMIs = [item.id for item in images] taggedAMIs = [self.img2ami[key].id for key in self.img2ami] @@ -162,9 +160,8 @@ def tangoMachineToEC2Instance(self, vm): memory = vm.memory # in Kbytes cores = vm.cores - if hasattr(config.Config, 'OVERRIDE_INST_TYPE') and \ - config.Config.OVERRIDE_INST_TYPE: - ec2instance['instance_type'] = config.Config.OVERRIDE_INST_TYPE + if vm.instance_type: + ec2instance['instance_type'] = vm.instance_type elif (cores == 1 and memory <= 613 * 1024): ec2instance['instance_type'] = 't2.micro' elif (cores == 1 and memory <= 1.7 * 1024 * 1024): @@ -180,7 +177,7 @@ def tangoMachineToEC2Instance(self, vm): else: ec2instance['instance_type'] = config.Config.DEFAULT_INST_TYPE - ec2instance['ami'] = self.img2ami[vm.name + ".img"].id + ec2instance['ami'] = self.img2ami[vm.image].id self.log.info("tangoMachineToEC2Instance: %s" % str(ec2instance)) return ec2instance From cdf8811bdc36e9a4a1cba711c9ee5cdfd01dd20d Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 16 Aug 2018 13:27:31 -0400 Subject: [PATCH 106/131] rework tango's reset code. It fails to remove existing vms on clean restart (i.e. when redis is restarted, too). 
--- preallocator.py | 20 ++++++++++++-------- tango.py | 31 +++++++++++-------------------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/preallocator.py b/preallocator.py index 249ed0ce..ae821ab5 100644 --- a/preallocator.py +++ b/preallocator.py @@ -168,29 +168,32 @@ def addVM(self, vm): # that. To solve the problem cleanly, preallocator should provide ONE primitive # to add/remove a vm from both total and free pools, instead of two disjoint ones. def removeFromFreePool(self, vm): - dieVM = None self.lock.acquire() size = self.machines.get(vm.name)[1].qsize() self.log.info("removeFromFreePool: %s in pool %s" % (vm.id, vm.name)) for i in range(size): # go through free pool - vm = self.machines.get(vm.name)[1].get_nowait() + freeVM = self.machines.get(vm.name)[1].get_nowait() # put it back into free pool, if not our vm - if vm.id != id: - self.machines.get(vm.name)[1].put(vm) + if vm.id != freeVM.id: + self.machines.get(vm.name)[1].put(freeVM) else: self.log.info("removeFromFreePool: found %s in pool %s" % (vm.id, vm.name)) # don't put this particular vm back to free pool, that is removal self.lock.release() - def removeVM(self, vm): + # return True if the vm is in the pool (and removed) + def removeVM(self, vm, mustFind=True): """ removeVM - remove a particular VM instance from the pool """ self.lock.acquire() machine = self.machines.get(vm.name) - if vm.id not in machine[0]: - self.log.error("removeVM: %s NOT found in pool" % (vm.id, vm.name)) + if not machine or vm.id not in machine[0]: + if mustFind: + self.log.error("removeVM: %s NOT found in pool" % vm.name) + else: + self.log.info("removeVM: %s NOT found in pool. 
This is OK" % vm.name) self.lock.release() - return + return False self.log.info("removeVM: %s" % vm.id) machine[0].remove(vm.id) @@ -198,6 +201,7 @@ def removeVM(self, vm): self.lock.release() self.removeFromFreePool(vm) # also remove from free pool, just in case + return True def _getNextID(self): """ _getNextID - returns next ID to be used for a preallocated diff --git a/tango.py b/tango.py index 31638d1f..413aae51 100755 --- a/tango.py +++ b/tango.py @@ -238,36 +238,27 @@ def resetTango(self, vmms): for key in self.preallocator.machines.keys(): freePool = self.preallocator.getPool(key)["free"] for vmId in freePool: - vmName = vobj.instanceName(vmId, key) - allFreeVMs.append(vmName) + allFreeVMs.append(vobj.instanceName(vmId, key)) self.log.info("vms in all free pools: %s" % allFreeVMs) + # allFreeVMs = [] + # For each in Tango's name space, destroy the onces in free pool. # AND remove it from Tango's internal bookkeeping. vms = vobj.getVMs() - self.log.debug("Pre-existing VMs: %s" % [vm.name for vm in vms]) + self.log.debug("Pre-existing VMs: %s" % + [vobj.instanceName(vm.id, vm.name) for vm in vms]) destroyedList = [] removedList = [] for vm in vms: - if re.match("%s-" % Config.PREFIX, vm.name): - - # Todo: should have an one-call interface to destroy the - # machine AND to keep the interval data consistent. 
- if vm.name not in allFreeVMs: - destroyedList.append(vm.name) + vmName = vobj.instanceName(vm.id, vm.name) + if re.match("%s-" % Config.PREFIX, vmName): + if vmName not in allFreeVMs: + destroyedList.append(vmName) + if self.preallocator.removeVM(vm, mustFind=False): + removedList.append(vmName) vobj.destroyVM(vm) - # also remove it from "total" set of the pool - (prefix, vmId, poolName) = vm.name.split("-") - machine = self.preallocator.machines.get(poolName) - if not machine: # the pool may not exist - continue - - if int(vmId) in machine[0]: - removedList.append(vm.name) - machine[0].remove(int(vmId)) - self.preallocator.machines.set(poolName, machine) - if destroyedList: self.log.warning("Killed these %s VMs on restart: %s" % (vmms_name, destroyedList)) From d6ee44d94cbb54e5ec7443a723cd707339bb85c5 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 16 Aug 2018 14:15:49 -0400 Subject: [PATCH 107/131] getVMs for ec2 should returns only vms that belongs to Tango (with prefix) --- vmms/ec2SSH.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 5f9afe49..fa81d1da 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -534,10 +534,8 @@ def getVMs(self): if instName and re.match("%s-" % config.Config.PREFIX, instName): vm.id = int(instName.split("-")[1]) vm.name = instName.split("-")[2] - elif not instName: - vm.name = "Instance_id_" + inst.id + "_without_name_tag" else: - vm.name = instName + continue # instance not belong Tango. Skip if inst.public_ip_address: vm.domain_name = inst.public_ip_address From fe00e97179ce419d7fe6a40a26e48b0a796a0780 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 22 Aug 2018 14:09:08 -0400 Subject: [PATCH 108/131] Rename TangoMachine's name property to pool. vmms/ec2 is done. 
--- jobManager.py | 12 ++-- jobQueue.py | 6 +- preallocator.py | 112 +++++++++++++++++++------------------ restful-tango/tangoREST.py | 1 + tango.py | 20 ++++--- tangoObjects.py | 11 ++-- tools/ec2Read.py | 16 +++--- vmms/ec2SSH.py | 85 +++++++++++++--------------- worker.py | 6 +- 9 files changed, 132 insertions(+), 137 deletions(-) diff --git a/jobManager.py b/jobManager.py index 7f4e1039..15fef281 100644 --- a/jobManager.py +++ b/jobManager.py @@ -52,7 +52,7 @@ def _getNextID(self): """ id = self.nextId self.nextId += 1 - # xxx simply wrap the id without guarding condition is bad. disable for now. + # xxxXXX??? simply wrap the id without guarding condition is bad. disable for now. # if self.nextId > 99999: # self.nextId = 10000 return id @@ -87,7 +87,7 @@ def __manage(self): from vmms.ec2SSH import Ec2SSH vmms = Ec2SSH(job.accessKeyId, job.accessKey) newVM = copy.deepcopy(job.vm) - newVM.id = self._getNextID() + newVM.id = self._getNextID() # xxxXXX??? try this path preVM = vmms.initializeVM(newVM) self.log.info("_manage init new vm %s" % preVM.id) else: @@ -97,17 +97,17 @@ def __manage(self): preVM = vm self.log.info("_manage use vm %s" % preVM.id) else: - # xxxXXX??? strongly suspect this code path not work. + # xxxXXX??? strongly suspect this code path doesn't work. # After setting REUSE_VMS to False, job submissions don't run. 
- preVM = self.preallocator.allocVM(job.vm.name) + preVM = self.preallocator.allocVM(job.vm.pool) self.log.info("_manage allocate vm %s" % preVM.id) vmms = self.vmms[job.vm.vmms] # Create new vmms object # Now dispatch the job to a worker self.log.info("Dispatched job %s:%d to %s [try %d]" % (job.name, job.id, preVM.name, job.retries)) - job.appendTrace("Dispatched job %s:%d [try %d]" % - (job.name, job.id, job.retries)) + job.appendTrace("Dispatched job %s:%d to %s [try %d]" % + (job.name, job.id, preVM.name, job.retries)) Worker( job, diff --git a/jobQueue.py b/jobQueue.py index 624f07ff..e3606cc5 100644 --- a/jobQueue.py +++ b/jobQueue.py @@ -209,8 +209,8 @@ def getNextPendingJobReuse(self, target_id=None): # Create or enlarge a pool if there is no free vm to use and # the limit for pool is not reached yet - if self.preallocator.freePoolSize(job.vm.name) == 0 and \ - self.preallocator.poolSize(job.vm.name) < Config.POOL_SIZE: + if self.preallocator.freePoolSize(job.vm.pool) == 0 and \ + self.preallocator.poolSize(job.vm.pool) < Config.POOL_SIZE: increment = 1 if hasattr(Config, 'POOL_ALLOC_INCREMENT') and Config.POOL_ALLOC_INCREMENT: increment = Config.POOL_ALLOC_INCREMENT @@ -219,7 +219,7 @@ def getNextPendingJobReuse(self, target_id=None): # If the job hasn't been assigned to a worker yet, see if there # is a free VM if (job.isNotAssigned()): - vm = self.preallocator.allocVM(job.vm.name) + vm = self.preallocator.allocVM(job.vm.pool) if vm: self.log.info("getNextPendingJobReuse alloc vm %s to job %s" % (vm, id)) self.queueLock.release() diff --git a/preallocator.py b/preallocator.py index ae821ab5..82bce04d 100644 --- a/preallocator.py +++ b/preallocator.py @@ -9,8 +9,8 @@ # # Preallocator - This class maintains a pool of active VMs for future # job requests. The pool is stored in dictionary called -# "machines". This structure keys off the name of the TangoMachine -# (.name). The values of this dictionary are two-element arrays: +# "machines". 
This structure keys off the pool of the TangoMachine +# (.pool). The values of this dictionary are two-element arrays: # Element 0 is the list of the IDs of the current VMs in this pool. # Element 1 is a queue of the VMs in this pool that are available to # be assigned to workers. @@ -48,14 +48,14 @@ def incrementPoolSize(self, vm, delta): """ self.lock.acquire() - if vm.name not in self.machines.keys(): - self.machines.set(vm.name, [[], TangoQueue(vm.name)]) + if vm.pool not in self.machines.keys(): + self.machines.set(vm.pool, [[], TangoQueue(vm.pool)]) # see comments in jobManager.py for the same call - self.machines.get(vm.name)[1].make_empty() - self.log.debug("Creating empty pool of %s instances" % (vm.name)) + self.machines.get(vm.pool)[1].make_empty() + self.log.debug("Creating empty pool of %s instances" % (vm.pool)) self.lock.release() - self.log.debug("incrementPoolSize: add %d new %s instances" % (delta, vm.name)) + self.log.debug("incrementPoolSize: add %d new vms to pool %s" % (delta, vm.pool)) threading.Thread(target=self.__create(vm, delta)).start() def update(self, vm, num): @@ -68,25 +68,25 @@ def update(self, vm, num): of machines as necessary. """ self.lock.acquire() - if vm.name not in self.machines.keys(): - self.machines.set(vm.name, [[], TangoQueue(vm.name)]) + if vm.pool not in self.machines.keys(): + self.machines.set(vm.pool, [[], TangoQueue(vm.pool)]) # see comments in jobManager.py for the same call - self.machines.get(vm.name)[1].make_empty() - self.log.debug("Creating empty pool of %s instances" % (vm.name)) + self.machines.get(vm.pool)[1].make_empty() + self.log.debug("Creating empty pool %s" % (vm.pool)) self.lock.release() - delta = num - len(self.machines.get(vm.name)[0]) + delta = num - len(self.machines.get(vm.pool)[0]) if delta > 0: # We need more self.machines, spin them up. 
self.log.debug( - "update: Creating %d new %s instances" % (delta, vm.name)) + "update: Creating %d new vms in pool %s" % (delta, vm.pool)) threading.Thread(target=self.__create(vm, delta)).start() elif delta < 0: # We have too many self.machines, remove them from the pool self.log.debug( - "update: Destroying %d preallocated %s instances" % - (-delta, vm.name)) + "update: Destroying %d preallocated vms in pool %s" % + (-delta, vm.pool)) for i in range(-1 * delta): threading.Thread(target=self.__destroy(vm)).start() @@ -105,6 +105,7 @@ def allocVM(self, vmName): self.lock.release() # If we're not reusing instances, then crank up a replacement + # xxxXXX??? test this code path if vm and not Config.REUSE_VMS: threading.Thread(target=self.__create(vm, 1)).start() @@ -115,10 +116,10 @@ def addToFreePool(self, vm): """ self.lock.acquire() - machine = self.machines.get(vm.name) - self.log.info("addToFreePool: add %s to free pool" % vm.id) + machine = self.machines.get(vm.pool) + self.log.info("addToFreePool: add vm %s to free pool" % vm.name) machine[1].put(vm) - self.machines.set(vm.name, machine) + self.machines.set(vm.pool, machine) self.lock.release() def freeVM(self, vm): @@ -129,20 +130,20 @@ def freeVM(self, vm): not_found = False should_destroy = False self.lock.acquire() - if vm and vm.id in self.machines.get(vm.name)[0]: + if vm and vm.id in self.machines.get(vm.pool)[0]: if (hasattr(Config, 'POOL_SIZE_LOW_WATER_MARK') and Config.POOL_SIZE_LOW_WATER_MARK >= 0 and - vm.name in self.machines.keys() and - self.freePoolSize(vm.name) >= Config.POOL_SIZE_LOW_WATER_MARK): + vm.pool in self.machines.keys() and + self.freePoolSize(vm.pool) >= Config.POOL_SIZE_LOW_WATER_MARK): self.log.info("freeVM: over low water mark. 
will destroy %s" % vm.id) should_destroy = True else: - machine = self.machines.get(vm.name) + machine = self.machines.get(vm.pool) self.log.info("freeVM: return %s to free pool" % vm.id) machine[1].put(vm) - self.machines.set(vm.name, machine) + self.machines.set(vm.pool, machine) else: - self.log.info("freeVM: not found in pool %s. will destroy %s" % (vm.name, vm.id)) + self.log.info("freeVM: %s not found in pool. Will destroy" % vm.name) not_found = True self.lock.release() @@ -157,10 +158,10 @@ def addVM(self, vm): """ addVM - add a particular VM instance to the pool """ self.lock.acquire() - machine = self.machines.get(vm.name) + machine = self.machines.get(vm.pool) machine[0].append(vm.id) - self.log.info("addVM: add %s" % vm.id) - self.machines.set(vm.name, machine) + self.log.info("addVM: add vm %s" % vm.name) + self.machines.set(vm.pool, machine) self.lock.release() # Note: This function is called from removeVM() to handle the case when a vm @@ -169,15 +170,15 @@ def addVM(self, vm): # to add/remove a vm from both total and free pools, instead of two disjoint ones. 
def removeFromFreePool(self, vm): self.lock.acquire() - size = self.machines.get(vm.name)[1].qsize() - self.log.info("removeFromFreePool: %s in pool %s" % (vm.id, vm.name)) + size = self.machines.get(vm.pool)[1].qsize() + self.log.info("removeFromFreePool: %s" % vm.name) for i in range(size): # go through free pool - freeVM = self.machines.get(vm.name)[1].get_nowait() + freeVM = self.machines.get(vm.pool)[1].get_nowait() # put it back into free pool, if not our vm if vm.id != freeVM.id: - self.machines.get(vm.name)[1].put(freeVM) + self.machines.get(vm.pool)[1].put(freeVM) else: - self.log.info("removeFromFreePool: found %s in pool %s" % (vm.id, vm.name)) + self.log.info("removeFromFreePool: found %s in pool" % vm.name) # don't put this particular vm back to free pool, that is removal self.lock.release() @@ -186,7 +187,7 @@ def removeVM(self, vm, mustFind=True): """ removeVM - remove a particular VM instance from the pool """ self.lock.acquire() - machine = self.machines.get(vm.name) + machine = self.machines.get(vm.pool) if not machine or vm.id not in machine[0]: if mustFind: self.log.error("removeVM: %s NOT found in pool" % vm.name) @@ -195,9 +196,9 @@ def removeVM(self, vm, mustFind=True): self.lock.release() return False - self.log.info("removeVM: %s" % vm.id) + self.log.info("removeVM: %s" % vm.name) machine[0].remove(vm.id) - self.machines.set(vm.name, machine) + self.machines.set(vm.pool, machine) self.lock.release() self.removeFromFreePool(vm) # also remove from free pool, just in case @@ -213,6 +214,7 @@ def _getNextID(self): self.nextID.increment() + # xxxXXX??? 
shouldn't reset if self.nextID.get() > 9999: self.nextID.set(1000) @@ -231,18 +233,17 @@ def __create(self, vm, cnt): for i in range(cnt): newVM = copy.deepcopy(vm) newVM.id = self._getNextID() - self.log.debug("__create|calling initializeVM") + self.log.debug("__create|calling initializeVM with id %d" % newVM.id) ret = vmms.initializeVM(newVM) if not ret: # ret is None when fails - self.log.debug("__create|failed initializeVM") + self.log.debug("__create|failed initializeVM with id %d" % newVM.id) continue - self.log.debug("__create|done with initializeVM") + self.log.debug("__create|done initializeVM with id %d" % newVM.id) time.sleep(Config.CREATEVM_SECS) self.addVM(newVM) self.addToFreePool(newVM) - self.log.debug("__create: Added vm %s to pool %s " % - (newVM.id, newVM.name)) + self.log.debug("__create: Added vm %s to pool" % newVM.name) def __destroy(self, vm): """ __destroy - Removes a VM from the pool @@ -254,11 +255,11 @@ def __destroy(self, vm): the free list is empty. """ self.lock.acquire() - dieVM = self.machines.get(vm.name)[1].get_nowait() + dieVM = self.machines.get(vm.pool)[1].get_nowait() self.lock.release() if dieVM: - self.log.info("__destroy: %s" % dieVM.id) + self.log.info("__destroy: %s" % dieVM.name) self.removeVM(dieVM) vmms = self.vmms[vm.vmms] vmms.safeDestroyVM(dieVM) @@ -275,15 +276,15 @@ def createVM(self, vm): self.log.info("createVM|calling initializeVM") ret = vmms.initializeVM(newVM) if not ret: - self.log.debug("createVM|failed initializeVM") + self.log.debug("createVM|failed initializeVM with id %d", newVM.id) return - self.log.info("createVM|done with initializeVM %s" % newVM.id) self.addVM(newVM) self.addToFreePool(newVM) - self.log.debug("createVM: Added vm %s to pool %s" % - (newVM.id, newVM.name)) + self.log.info("createVM|done with initializeVM %s" % newVM.name) + # xxxXXX??? 
most likely unused, only called by delVM() + ''' def destroyVM(self, vmName, id): """ destroyVM - Called by the delVM API function to remove and destroy a particular VM instance from a pool. We only allow @@ -315,6 +316,7 @@ def destroyVM(self, vmName, id): return 0 else: return -1 + ''' def getAllPools(self): result = {} @@ -322,29 +324,29 @@ def getAllPools(self): result[vmName] = self.getPool(vmName) return result - def getPool(self, vmName): + def getPool(self, pool): """ getPool - returns the members of a pool and its free list """ result = {} - if vmName not in self.machines.keys(): + if pool not in self.machines.keys(): return result result["total"] = [] result["free"] = [] free_list = [] self.lock.acquire() - size = self.machines.get(vmName)[1].qsize() + size = self.machines.get(pool)[1].qsize() for i in range(size): - vm = self.machines.get(vmName)[1].get_nowait() + vm = self.machines.get(pool)[1].get_nowait() free_list.append(vm.id) - machine = self.machines.get(vmName) + machine = self.machines.get(pool) machine[1].put(vm) - self.machines.set(vmName, machine) + self.machines.set(pool, machine) self.lock.release() - result["total"] = self.machines.get(vmName)[0] + result["total"] = self.machines.get(pool)[0] result["free"] = free_list - self.log.info("getPool: free pool %s" % ', '.join(str(x) for x in result["free"])) - self.log.info("getPool: total pool %s" % ', '.join(str(x) for x in result["total"])) + self.log.info("getPool %s: free pool %s" % (pool, result["free"])) + self.log.info("getPool %s: total pool %s" % (pool, result["total"])) return result diff --git a/restful-tango/tangoREST.py b/restful-tango/tangoREST.py index 386dcdcf..8873004d 100644 --- a/restful-tango/tangoREST.py +++ b/restful-tango/tangoREST.py @@ -192,6 +192,7 @@ def convertTangoMachineObj(self, tangoMachine): vm['disk'] = tangoMachine.disk vm['id'] = tangoMachine.id vm['name'] = tangoMachine.name + vm['pool'] = tangoMachine.pool vm['instance_type'] = tangoMachine.instance_type 
return vm diff --git a/tango.py b/tango.py index 413aae51..ce32e592 100755 --- a/tango.py +++ b/tango.py @@ -130,8 +130,8 @@ def getJobs(self, item): def preallocVM(self, vm, num): """ preallocVM - Set the pool size for VMs of type vm to num """ - self.log.debug("Received preallocVM(%s,%d)request" - % (vm.name, num)) + self.log.debug("Received preallocVM request: %d vms in pool %s" + % (num, vm.pool)) try: vmms = self.preallocator.vmms[vm.vmms] if not vm or num < 0: @@ -159,6 +159,8 @@ def getVMs(self, vmms_name): self.log.error("getVMs request failed: %s" % err) return [] + # xxxXXX??? plan to remove + ''' def delVM(self, vmName, id): """ delVM - delete a specific VM instance from a pool """ @@ -170,6 +172,7 @@ def delVM(self, vmName, id): except Exception as err: self.log.error("delVM request failed: %s" % err) return -1 + ''' def getPool(self, vmName): """ getPool - Return the current members of a pool and its free list @@ -246,17 +249,16 @@ def resetTango(self, vmms): # For each in Tango's name space, destroy the onces in free pool. # AND remove it from Tango's internal bookkeeping. 
vms = vobj.getVMs() - self.log.debug("Pre-existing VMs: %s" % - [vobj.instanceName(vm.id, vm.name) for vm in vms]) + self.log.debug("Pre-existing VMs: %s" % [vm.name for vm in vms]) + destroyedList = [] removedList = [] for vm in vms: - vmName = vobj.instanceName(vm.id, vm.name) - if re.match("%s-" % Config.PREFIX, vmName): - if vmName not in allFreeVMs: - destroyedList.append(vmName) + if re.match("%s-" % Config.PREFIX, vm.name): + if vm.name not in allFreeVMs: + destroyedList.append(vm.name) if self.preallocator.removeVM(vm, mustFind=False): - removedList.append(vmName) + removedList.append(vm.name) vobj.destroyVM(vm) if destroyedList: diff --git a/tangoObjects.py b/tangoObjects.py index 29ef04d9..2c951d09 100644 --- a/tangoObjects.py +++ b/tangoObjects.py @@ -59,7 +59,6 @@ def __init__(self, image=None, vmms=None, self.disk = disk self.domain_name = domain_name - self.ec2_id = None self.resume = None self.id = None self.instance_id = None @@ -70,10 +69,8 @@ def __init__(self, image=None, vmms=None, # aside for further investigation. self.keepForDebugging = False - # The "name" property is vmms dependent. It doesn't mean the name of - # vm. It's derived from the image name and used as the vm pool name - # The actual vm name is constructed by the instanceName method in each vmms. - self.name = None + self.pool = None + self.name = None # in the form of prefix-id-pool, constructed by the vmms # The image may contain instance type if vmms is ec2. Example: # course101+t2.small. 
@@ -82,8 +79,8 @@ def __init__(self, image=None, vmms=None, if len(imageParts) == 2: self.image = imageParts[0] self.instance_type = imageParts[1] - (name, ext) = os.path.splitext(self.image) - self.name = name + ("+" + self.instance_type if self.instance_type else "") + (pool, ext) = os.path.splitext(self.image) + self.pool = pool + ("+" + self.instance_type if self.instance_type else "") def __repr__(self): return "TangoMachine(image: %s, vmms: %s, id: %s)" % (self.image, self.vmms, self.id) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 9a531f0c..592e5eb3 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -47,7 +47,7 @@ def destroyVMs(): print "number of Tango VMs:", len(vms) for vm in vms: if vm.id: - print "destroy", nameToPrint(ec2.instanceName(vm.id, vm.name)) + print "destroy", nameToPrint(vm.name) ec2.destroyVM(vm) else: print "VM not in Tango naming pattern:", nameToPrint(vm.name) @@ -57,7 +57,7 @@ def pingVMs(): print "number of Tango VMs:", len(vms) for vm in vms: if vm.id: - print "ping", nameToPrint(ec2.instanceName(vm.id, vm.name)) + print "ping", nameToPrint(vm.name) # Note: following call needs the private key file for aws to be # at wherever SECURITY_KEY_PATH in config.py points to. 
# For example, if SECURITY_KEY_PATH = '/root/746-autograde.pem', @@ -167,10 +167,9 @@ def addVMs(): instanceTypeTried = False for key in ec2.img2ami.keys(): vm = TangoMachine(vmms="ec2SSH", image=key) - pool = server.preallocator.getPool(vm.name) + pool = server.preallocator.getPool(vm.pool) currentCount = len(pool["total"]) if pool else 0 - print "adding a vm into pool", nameToPrint(vm.name) - print "pool", nameToPrint(vm.name), "current size", currentCount + print "adding a vm into pool", nameToPrint(vm.pool), "current size", currentCount server.preallocVM(vm, currentCount + 1) if instanceTypeTried: @@ -179,10 +178,9 @@ def addVMs(): instanceTypeTried = True vm = TangoMachine(vmms="ec2SSH", image=key+"+t2.small") - pool = server.preallocator.getPool(vm.name) + pool = server.preallocator.getPool(vm.pool) currentCount = len(pool["total"]) if pool else 0 - print "pool", nameToPrint(vm.name), "current size", currentCount - print "adding a vm into pool", nameToPrint(vm.name) + print "adding a vm into pool", nameToPrint(vm.pool), "current size", currentCount server.preallocVM(vm, currentCount + 1) def destroyRedisPools(): @@ -287,7 +285,7 @@ def destroyRedisPools(): # to test non-default access id/key, the aws image must have the key manually # installed or allows the key to be installed by the aws service. # the following assumes we have such image with a "Name" tag "test01.img" - vm.name = "test01" + vm.pool = "test01" ec2WithKey.initializeVM(vm) ec2WithKey.waitVM(vm, Config.WAITVM_TIMEOUT) listInstances() diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index fa81d1da..a2d8a7a3 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -132,12 +132,11 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): # VMMS helper methods # - def instanceName(self, id, name): + def instanceName(self, id, pool): """ instanceName - Constructs a VM instance name. Always use - this function when you need a VM instance name. Never generate - instance names manually. 
+ this function when you need a VM instance name, or use vm.name """ - return "%s-%d-%s" % (config.Config.PREFIX, id, name) + return "%s-%d-%s" % (config.Config.PREFIX, id, pool) def keyPairName(self, id, name): """ keyPairName - Constructs a unique key pair name. @@ -231,9 +230,9 @@ def initializeVM(self, vm): # Create the instance and obtain the reservation newInstance = None try: - instanceName = self.instanceName(vm.id, vm.name) + vm.name = self.instanceName(vm.id, vm.pool) ec2instance = self.tangoMachineToEC2Instance(vm) - self.log.info("initializeVM: %s %s" % (instanceName, str(ec2instance))) + self.log.info("initializeVM: %s %s" % (vm.name, str(ec2instance))) # ensure that security group exists self.createSecurityGroup() if self.useDefaultKeyPair: @@ -259,11 +258,11 @@ def initializeVM(self, vm): if newInstance: # Assign name to EC2 instance self.boto3resource.create_tags(Resources=[newInstance.id], - Tags=[{"Key": "Name", "Value": instanceName}]) + Tags=[{"Key": "Name", "Value": vm.name}]) self.log.info("new instance %s created with name tag %s" % - (newInstance.id, instanceName)) + (newInstance.id, vm.name)) else: - raise ValueError("cannot find new instance for %s" % instanceName) + raise ValueError("cannot find new instance for %s" % vm.name) # Wait for instance to reach 'running' state start_time = time.time() @@ -278,7 +277,7 @@ def initializeVM(self, vm): newInstance.load() # reload the state of the instance for inst in instances.filter(InstanceIds=[newInstance.id]): - self.log.debug("VM %s: is running %s" % (instanceName, newInstance.id)) + self.log.debug("VM %s: is running %s" % (vm.name, newInstance.id)) instanceRunning = True if instanceRunning: @@ -286,15 +285,15 @@ def initializeVM(self, vm): if time.time() - start_time > config.Config.INITIALIZEVM_TIMEOUT: raise ValueError("VM %s: timeout (%d seconds) before reaching 'running' state" % - (instanceName, config.Config.TIMER_POLL_INTERVAL)) + (vm.name, config.Config.TIMER_POLL_INTERVAL)) - 
self.log.debug("VM %s: Waiting to reach 'running' from 'pending'" % instanceName) + self.log.debug("VM %s: Waiting to reach 'running' from 'pending'" % vm.name) time.sleep(config.Config.TIMER_POLL_INTERVAL) # end of while loop self.log.info( "VM %s | State %s | Reservation %s | Public DNS Name %s | Public IP Address %s" % - (instanceName, + (vm.name, newInstance.state, reservation, newInstance.public_dns_name, @@ -302,8 +301,8 @@ def initializeVM(self, vm): # Save domain and id ssigned by EC2 in vm object vm.domain_name = newInstance.public_ip_address - vm.ec2_id = newInstance.id - self.log.debug("VM %s: %s" % (instanceName, newInstance)) + vm.instance_id = newInstance.id + self.log.debug("VM %s: %s" % (vm.name, newInstance)) return vm except Exception as e: @@ -319,19 +318,18 @@ def waitVM(self, vm, max_secs): VM is a boto.ec2.instance.Instance object. """ - self.log.info("WaitVM: %s, ec2_id: %s" % (vm.id, vm.ec2_id)) + self.log.info("WaitVM: %s %s" % (vm.name, vm.instance_id)) # test if the vm is still an instance if not self.existsVM(vm): - self.log.info("VM %s: no longer an instance" % vm.id) + self.log.info("VM %s: no longer an instance" % vm.name) return -1 # First, wait for ping to the vm instance to work instance_down = 1 - instanceName = self.instanceName(vm.id, vm.name) start_time = time.time() domain_name = self.domainName(vm) - self.log.info("WaitVM: pinging %s" % domain_name) + self.log.info("WaitVM: pinging %s %s" % (domain_name, vm.name)) while instance_down: instance_down = subprocess.call("ping -c 1 %s" % (domain_name), shell=True, @@ -349,7 +347,7 @@ def waitVM(self, vm, max_secs): # The ping worked, so now wait for SSH to work before # declaring that the VM is ready - self.log.debug("VM %s: ping completed" % (vm.id)) + self.log.debug("VM %s: ping completed" % (vm.name)) while(True): elapsed_secs = time.time() - start_time @@ -357,8 +355,7 @@ def waitVM(self, vm, max_secs): # Give up if the elapsed time exceeds the allowable time if elapsed_secs > 
max_secs: self.log.info( - "VM %s: SSH timeout after %d secs" % - (instanceName, elapsed_secs)) + "VM %s: SSH timeout after %d secs" % (vm.name, elapsed_secs)) return -1 # If the call to ssh returns timeout (-1) or ssh error @@ -369,8 +366,7 @@ def waitVM(self, vm, max_secs): ["%s@%s" % (self.ec2User, domain_name), "(:)"], max_secs - elapsed_secs) - self.log.debug("VM %s: ssh returned with %d" % - (instanceName, ret)) + self.log.debug("VM %s: ssh returned with %d" % (vm.name, ret)) if (ret != -1) and (ret != 255): return 0 @@ -405,8 +401,7 @@ def runJob(self, vm, runTimeout, maxOutputFileSize): redirect output to file "output". """ domain_name = self.domainName(vm) - self.log.debug("runJob: Running job on VM %s" % - self.instanceName(vm.id, vm.name)) + self.log.debug("runJob: Running job on VM %s" % vm.name) # Setting arguments for VM and running job runcmd = "/usr/bin/time --output=time.out autodriver \ @@ -476,31 +471,30 @@ def destroyVM(self, vm): """ self.log.info("destroyVM: %s %s %s %s" % - (vm.ec2_id, vm.name, vm.keepForDebugging, vm.notes)) + (vm.instance_id, vm.name, vm.keepForDebugging, vm.notes)) try: # Keep the vm and mark with meaningful tags for debugging if hasattr(config.Config, 'KEEP_VM_AFTER_FAILURE') and \ config.Config.KEEP_VM_AFTER_FAILURE and vm.keepForDebugging: - iName = self.instanceName(vm.id, vm.name) - self.log.info("Will keep VM %s for further debugging" % iName) - instance = self.boto3resource.Instance(vm.ec2_id) + self.log.info("Will keep VM %s for further debugging" % vm.name) + instance = self.boto3resource.Instance(vm.instance_id) # delete original name tag and replace it with "failed-xyz" # add notes tag for test name - tag = self.boto3resource.Tag(vm.ec2_id, "Name", iName) + tag = self.boto3resource.Tag(vm.instance_id, "Name", vm.name) if tag: tag.delete() - instance.create_tags(Tags=[{"Key": "Name", "Value": "failed-" + iName}]) + instance.create_tags(Tags=[{"Key": "Name", "Value": "failed-" + vm.name}]) 
instance.create_tags(Tags=[{"Key": "Notes", "Value": vm.notes}]) return - self.boto3resource.instances.filter(InstanceIds=[vm.ec2_id]).terminate() + self.boto3resource.instances.filter(InstanceIds=[vm.instance_id]).terminate() # delete dynamically created key if not self.useDefaultKeyPair: self.deleteKeyPair() except Exception as e: - self.log.error("destroyVM init Failed: %s for vm %s" % (e, vm.ec2_id)) + self.log.error("destroyVM init Failed: %s for vm %s" % (e, vm.instance_id)) pass def safeDestroyVM(self, vm): @@ -525,26 +519,27 @@ def getVMs(self): for inst in self.boto3resource.instances.filter(Filters=filters): vm = TangoMachine() # make a Tango internal vm structure - vm.ec2_id = inst.id + vm.instance_id = inst.id vm.id = None # the serial number as in inst name PREFIX-serial-IMAGE vm.domain_name = None instName = self.getTag(inst.tags, "Name") # Name tag is the standard form of prefix-serial-image - if instName and re.match("%s-" % config.Config.PREFIX, instName): - vm.id = int(instName.split("-")[1]) - vm.name = instName.split("-")[2] - else: - continue # instance not belong Tango. 
Skip + if not (instName and re.match("%s-" % config.Config.PREFIX, instName)): + self.log.debug('getVMs: Instance id %s skipped' % vm.instance_id) + continue # instance without name tag or proper prefix + vm.id = int(instName.split("-")[1]) + vm.pool = instName.split("-")[2] + vm.name = instName if inst.public_ip_address: vm.domain_name = inst.public_ip_address - - self.log.debug('getVMs: Instance id %s, pool %s, vm id %s' % - (vm.ec2_id, vm.name, vm.id)) vms.append(vm) + self.log.debug('getVMs: Instance id %s, name %s' % + (vm.instance_id, vm.name)) return vms + except Exception as e: self.log.debug("getVMs Failed: %s" % e) @@ -554,8 +549,8 @@ def existsVM(self, vm): filters=[{'Name': 'instance-state-name', 'Values': ['running']}] instances = self.boto3resource.instances.filter(Filters=filters) - for inst in instances.filter(InstanceIds=[vm.ec2_id]): - self.log.debug("VM %s: exists and running" % vm.ec2_id) + for inst in instances.filter(InstanceIds=[vm.instance_id]): + self.log.debug("VM %s %s: exists and running" % (vm.instance_id, vm.name)) return True return False diff --git a/worker.py b/worker.py index 213c5229..4f87f3fb 100644 --- a/worker.py +++ b/worker.py @@ -148,8 +148,8 @@ def afterJobExecution(self, hdrfile, msg, returnVM): return def jobLogAndTrace(self, stageMsg, vm, status=None): - msg = stageMsg + " %s for job %s:%d" % (self.vmms.instanceName(vm.id, vm.name), - self.job.name, self.job.id) + msg = stageMsg + " %s for job %s:%d" % (vm.name, self.job.name, self.job.id) + if (status != None): if (status == 0): msg = "done " + msg @@ -188,7 +188,7 @@ def run(self): # Assigning job to a new VM else: - self.job.vm.id = self.job.id + self.job.vm.id = self.job.id # xxxXXX??? 
don't know how this works self.job.updateRemote() # Host name returned from EC2 is stored in the vm object From f9e03c250e596ad32a9d669fce9e4bfff3d60d35 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 28 Aug 2018 13:54:16 -0400 Subject: [PATCH 109/131] remove bad references to locations. --- tools/config_for_run_jobs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index ed03329d..6453dd94 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -13,8 +13,6 @@ class Config: course = "czang-exp" # YOUR root dir for course/lab definitions and handin (student submissions) - courseRoot = "/n/scratch/czang/f16/" - #courseRoot = "/n/scratch/czang/f17/" courseRoot = "/mnt/data/f16/" # YOUR lab definitions. The index of the lab is given to run_job.py From 88ac6e198a2c454125433415e0e2b1b31811740f Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 30 Aug 2018 13:45:58 -0400 Subject: [PATCH 110/131] log file copied from vm is now readable to all. --- worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/worker.py b/worker.py index 4f87f3fb..bd7cfe55 100644 --- a/worker.py +++ b/worker.py @@ -141,6 +141,7 @@ def afterJobExecution(self, hdrfile, msg, returnVM): # Update the text that users see in the autodriver output file self.appendMsg(hdrfile, msg) self.catFiles(hdrfile, self.job.outputFile) + os.chmod(self.job.outputFile, 0o644) # Thread exit after termination self.detachVM(return_vm=returnVM) From 77a26facc15ceaaa64e84e3370aaab633c763f5c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Thu, 15 Nov 2018 14:33:13 -0500 Subject: [PATCH 111/131] Add script to check the health of Tango. 
--- tools/check_jobs.py | 99 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 tools/check_jobs.py diff --git a/tools/check_jobs.py b/tools/check_jobs.py new file mode 100644 index 00000000..f2c8c810 --- /dev/null +++ b/tools/check_jobs.py @@ -0,0 +1,99 @@ +import os, re, glob, datetime, time, json, string +from dateutil import parser +import smtplib +from email.mime.text import MIMEText + +from config_for_run_jobs import Config +from util import Cmd +from util import CommandLine +from util import Lab +import util + +# Drive exiting student submissions to Tango. +# Find course/lab at specified location and submits work from the handin directory. +# Then wait for job output files. +# +# Use -h to show usage. +# See config_for_run_jobs.py for configuration options. + +cfg = Config() +cmd = Cmd(cfg, None) +jsonResult = {} +reportedJobs = [] +mailbodyP1 = "" +mailbodyP2 = "\nDetails:\n" + +def sendmail(): + global mailbodyP1, mailbodyP2 + + if not mailbodyP1: + print "No error to report @ %s" % datetime.datetime.now() + return + + print "email report @ %s" % datetime.datetime.now() + HOST = "smtp.pdl.local.cmu.edu" + SUBJECT = "Autolab trouble @ %s" % datetime.datetime.now() + FROM = "czang@cmu.edu" + TO = "czang@cmu.edu" + BODY = string.join(( + "From: %s" % FROM, + "To: %s" % TO, + "Subject: %s" % SUBJECT , + "", + mailbodyP1 + mailbodyP2 + ), "\r\n") + server = smtplib.SMTP(HOST) + # server.sendmail(FROM, ["czang@cmu.edu", "jboles@cmu.edu"], BODY) + server.sendmail(FROM, ["czang@cmu.edu"], BODY) + server.quit() + +def report(jobId, msg): + global mailbodyP1, mailbodyP2 + email = "" + + if jobId in reportedJobs: + return + + for job in jsonResult["jobs"]: + if job["id"] == jobId: + mailbodyP2 += json.dumps(job, indent=2, sort_keys=True) + matchObj = re.match(r'(.*)_[0-9]+_(.*)', job["name"], re.M|re.I) + email = matchObj.group(2) + mailbodyP1 += "job " + str(jobId) + ", student " + email + ": " + msg + "\n" + +# use a 
dump file for testing +with open('./testData') as jsonData: + jsonResult = json.load(jsonData) + +while 1: + # jsonResult = cmd.returnLiveJobs() # comment out this line to use test data + + for job in jsonResult["jobs"]: + jobId = job["id"] + if "trace" not in job: + report(jobId, "Can't find trace for the job") + continue + + lastLineOfTrace = job["trace"][-1] + (timeStr, msg) = lastLineOfTrace.split("|") + timestamp = parser.parse(timeStr) + action = msg.split()[0] + jobTimeout = job["timeout"] + + now = datetime.datetime.now() + elapsed = (now - timestamp).total_seconds() + if action == "running" and elapsed > (jobTimeout + 120): + report(jobId, "Job should be timed out") + elif elapsed > 120: + report(jobId, "It's been too long since last trace") + # end of for loop + + sendmail() + + print "sleep for a while..." + break + time.sleep(60) +# end of while loop + +exit() + From 09bca0a83184f49a28a8f78dcaf74478ddc3ef20 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 19 Nov 2018 12:36:35 -0500 Subject: [PATCH 112/131] check in a working version of check_jobs --- tools/check_jobs.py | 69 +++++++++++++++++++++++++-------------------- tools/util.py | 3 ++ 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/tools/check_jobs.py b/tools/check_jobs.py index f2c8c810..de985533 100644 --- a/tools/check_jobs.py +++ b/tools/check_jobs.py @@ -18,6 +18,9 @@ cfg = Config() cmd = Cmd(cfg, None) + +REPORTED_JOBS_PATH = "/var/run/tango/check_jobs.json" + jsonResult = {} reportedJobs = [] mailbodyP1 = "" @@ -43,8 +46,8 @@ def sendmail(): mailbodyP1 + mailbodyP2 ), "\r\n") server = smtplib.SMTP(HOST) - # server.sendmail(FROM, ["czang@cmu.edu", "jboles@cmu.edu"], BODY) - server.sendmail(FROM, ["czang@cmu.edu"], BODY) + server.sendmail(FROM, ["czang@cmu.edu", "jboles@cmu.edu"], BODY) + # server.sendmail(FROM, ["czang@cmu.edu"], BODY) server.quit() def report(jobId, msg): @@ -54,46 +57,52 @@ def report(jobId, msg): if jobId in reportedJobs: return + 
reportedJobs.append(jobId) for job in jsonResult["jobs"]: if job["id"] == jobId: mailbodyP2 += json.dumps(job, indent=2, sort_keys=True) matchObj = re.match(r'(.*)_[0-9]+_(.*)', job["name"], re.M|re.I) email = matchObj.group(2) mailbodyP1 += "job " + str(jobId) + ", student " + email + ": " + msg + "\n" - + # use a dump file for testing -with open('./testData') as jsonData: - jsonResult = json.load(jsonData) +if 0: + with open('./testData') as jsonData: + jsonResult = json.load(jsonData) -while 1: - # jsonResult = cmd.returnLiveJobs() # comment out this line to use test data +# read the jobs that have been reported +try: + with open(REPORTED_JOBS_PATH) as jsonData: + reportedJobs = json.load(jsonData) +except: + reportedJobs = [] - for job in jsonResult["jobs"]: - jobId = job["id"] - if "trace" not in job: - report(jobId, "Can't find trace for the job") - continue +jsonResult = cmd.returnLiveJobs() # comment out this line to use test data - lastLineOfTrace = job["trace"][-1] - (timeStr, msg) = lastLineOfTrace.split("|") - timestamp = parser.parse(timeStr) - action = msg.split()[0] - jobTimeout = job["timeout"] - - now = datetime.datetime.now() - elapsed = (now - timestamp).total_seconds() - if action == "running" and elapsed > (jobTimeout + 120): - report(jobId, "Job should be timed out") - elif elapsed > 120: - report(jobId, "It's been too long since last trace") - # end of for loop +for job in jsonResult["jobs"]: + jobId = job["id"] + if "trace" not in job: + report(jobId, "Can't find trace for the job") + continue + + lastLineOfTrace = job["trace"][-1] + (timeStr, msg) = lastLineOfTrace.split("|") + timestamp = parser.parse(timeStr) + action = msg.split()[0] + jobTimeout = job["timeout"] - sendmail() + now = datetime.datetime.now() + elapsed = (now - timestamp).total_seconds() + if action == "running": + if elapsed > (jobTimeout + 120): + report(jobId, "Job should be timed out") + elif elapsed > 120: + report(jobId, "It's been too long since last trace") +# end 
of for loop - print "sleep for a while..." - break - time.sleep(60) -# end of while loop +sendmail() +with open(REPORTED_JOBS_PATH, 'w') as outfile: + json.dump(reportedJobs, outfile) exit() diff --git a/tools/util.py b/tools/util.py index 015a6c3c..a873163d 100644 --- a/tools/util.py +++ b/tools/util.py @@ -119,6 +119,9 @@ def poll(self, lab, studentFile): myCmd = " --poll -l " + lab.courseLab self.run(myCmd + " --outputFile " + studentFile["output"]) + def returnLiveJobs(self): + return json.loads(self.runAndOutput(" --jobs ").splitlines()[1]) + def jobs(self): result = json.loads(self.runAndOutput(" --jobs ").splitlines()[1]) nJobs = len(result["jobs"]) From d24fdb4583c7e76f570581866f455e601280d35c Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 20 Nov 2018 16:37:18 -0500 Subject: [PATCH 113/131] fix: timezone issue, default instance type not observed, undefined variable --- tango.py | 3 ++- vmms/ec2SSH.py | 20 ++++---------------- worker.py | 1 + 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/tango.py b/tango.py index ce32e592..ef7d300b 100755 --- a/tango.py +++ b/tango.py @@ -204,7 +204,8 @@ def getInfo(self): stats['runjob_errors'] = Config.runjob_errors stats['copyout_errors'] = Config.copyout_errors stats['num_threads'] = threading.activeCount() - stats['timezone_offset'] = time.altzone + isdst = (time.struct_time.tm_isdst == 1) + stats['timezone_offset'] = time.altzone if isdst else time.timezone (zone, daylight) = time.tzname stats['timezone_name'] = zone + ("" if not daylight else ("/" + daylight)) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index a2d8a7a3..3cf12bdf 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -156,25 +156,13 @@ def tangoMachineToEC2Instance(self, vm): """ ec2instance = dict() - memory = vm.memory # in Kbytes - cores = vm.cores + # Note: Unlike other vmms backend, instance type is chosen from + # the optional instance type attached to image name as + # "image+instance_type", such as 
my_course_mage+t2.small. + ec2instance['instance_type'] = config.Config.DEFAULT_INST_TYPE if vm.instance_type: ec2instance['instance_type'] = vm.instance_type - elif (cores == 1 and memory <= 613 * 1024): - ec2instance['instance_type'] = 't2.micro' - elif (cores == 1 and memory <= 1.7 * 1024 * 1024): - ec2instance['instance_type'] = 'm1.small' - elif (cores == 1 and memory <= 3.75 * 1024 * 1024): - ec2instance['instance_type'] = 'm3.medium' - elif (cores == 2): - ec2instance['instance_type'] = 'm3.large' - elif (cores == 4): - ec2instance['instance_type'] = 'm3.xlarge' - elif (cores == 8): - ec2instance['instance_type'] = 'm3.2xlarge' - else: - ec2instance['instance_type'] = config.Config.DEFAULT_INST_TYPE ec2instance['ami'] = self.img2ami[vm.image].id self.log.info("tangoMachineToEC2Instance: %s" % str(ec2instance)) diff --git a/worker.py b/worker.py index bd7cfe55..43a8da01 100644 --- a/worker.py +++ b/worker.py @@ -175,6 +175,7 @@ def run(self): self.log.debug("Run worker") vm = None + msg = "" # Header message for user hdrfile = tempfile.mktemp() From 6b6de4c7924c5b4f9c68c8720113d0fc7057c8c7 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 14 Aug 2019 15:06:24 -0400 Subject: [PATCH 114/131] Fix check_jobs (cron job in production Autolab): comment its purpose, report file writing failure, always write current trouble jobs into file --- tools/check_jobs.py | 63 +++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/tools/check_jobs.py b/tools/check_jobs.py index de985533..1952951e 100644 --- a/tools/check_jobs.py +++ b/tools/check_jobs.py @@ -9,27 +9,35 @@ from util import Lab import util -# Drive exiting student submissions to Tango. -# Find course/lab at specified location and submits work from the handin directory. -# Then wait for job output files. -# -# Use -h to show usage. -# See config_for_run_jobs.py for configuration options. 
+# This script is run as a cron job every minute to detect potentially +# stuck jobs and send email to the administrator. +# It asks Tango for the live jobs. Then it looks at the last-seen +# timestamp in each job's trace to determine if it's a "slow" job. +# It keeps the questionable jobs in a file so that they are not +# reported again by the next execution of this script. +# Potential false negative: Suppose Tango dies and is restarted, +# then the jobIds stored in the "reported jobs" file from Tango's last +# incarnation may overlap with the current jobIds. In such case, +# the overlapping jobIds will not be reported. However, when Tango +# is stuck there usually will be more stuck jobs to be reported for +# the admin's attention. cfg = Config() cmd = Cmd(cfg, None) -REPORTED_JOBS_PATH = "/var/run/tango/check_jobs.json" +REPORTED_JOBS_PATH = "/var/log/tango/lastSeenSlowJobsBy_check_jobs" jsonResult = {} -reportedJobs = [] +reportedJobs = [] # trouble jobs found in last execution +troubleJobs = [] # trouble jobs found in this execution +writeFailure = "" mailbodyP1 = "" -mailbodyP2 = "\nDetails:\n" +mailbodyP2 = "" def sendmail(): global mailbodyP1, mailbodyP2 - if not mailbodyP1: + if not mailbodyP1 and not writeFailure: print "No error to report @ %s" % datetime.datetime.now() return @@ -43,7 +51,7 @@ def sendmail(): "To: %s" % TO, "Subject: %s" % SUBJECT , "", - mailbodyP1 + mailbodyP2 + writeFailure + mailbodyP1 + mailbodyP2 ), "\r\n") server = smtplib.SMTP(HOST) server.sendmail(FROM, ["czang@cmu.edu", "jboles@cmu.edu"], BODY) @@ -53,17 +61,22 @@ def sendmail(): def report(jobId, msg): global mailbodyP1, mailbodyP2 email = "" - + + # add into trouble list but may not report this time + troubleJobs.append(jobId) if jobId in reportedJobs: return - reportedJobs.append(jobId) - for job in jsonResult["jobs"]: - if job["id"] == jobId: - mailbodyP2 += json.dumps(job, indent=2, sort_keys=True) - matchObj = re.match(r'(.*)_[0-9]+_(.*)', job["name"], re.M|re.I) - 
email = matchObj.group(2) - mailbodyP1 += "job " + str(jobId) + ", student " + email + ": " + msg + "\n" + # go through the job list to find the job by jobId + for job in jsonResult["jobs"]: + if job["id"] != jobId: continue + if not mailbodyP1: + mailbodyP1 = "\nTrouble jobs:\n" + mailbodyP2 = "\nJob details:\n" + matchObj = re.match(r'(.*)_[0-9]+_(.*)', job["name"], re.M|re.I) + email = matchObj.group(2) + mailbodyP1 += "job %s, student %s: %s\n" % (jobId, email, msg) + mailbodyP2 += json.dumps(job, indent=2, sort_keys=True) # use a dump file for testing if 0: @@ -78,7 +91,7 @@ def report(jobId, msg): reportedJobs = [] jsonResult = cmd.returnLiveJobs() # comment out this line to use test data - + for job in jsonResult["jobs"]: jobId = job["id"] if "trace" not in job: @@ -100,9 +113,15 @@ def report(jobId, msg): report(jobId, "It's been too long since last trace") # end of for loop +# write troubled jobs found in this execution to file +try: + with open(REPORTED_JOBS_PATH, 'w') as outfile: + json.dump(troubleJobs, outfile) +except Exception as e: + writeFailure = "Failed to write to %s: %s\n" % (REPORTED_JOBS_PATH, e) + +# report trouble jobs AND maybe failure of writing to file sendmail() -with open(REPORTED_JOBS_PATH, 'w') as outfile: - json.dump(reportedJobs, outfile) exit() From 8f0ed665b6b432837cec7fda38b7ed4f6a8e0581 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 20 Aug 2019 16:48:41 -0400 Subject: [PATCH 115/131] Add README for autodriver --- autodriver/README | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 autodriver/README diff --git a/autodriver/README b/autodriver/README new file mode 100644 index 00000000..999458df --- /dev/null +++ b/autodriver/README @@ -0,0 +1,25 @@ +To build a grading vm image for Autolab jobs: + +Create a vm with a stock linux image +Copy autodriver.c and Makefile to the vm and compile it to autodriver +Copy autodriver to any common path, make it owned by root wtih setuid bits. 
+For example: -rwsr-sr-x 1 root root /usr/bin/autodriver + +Create the following users +autolab: The ssh/scp user tied with selected key pair of you cloud account +autograde: The user to run TA's grader starting from the top Makefile (see autodriver.c) +student: For student to use the exact image for coding/testing + +The sequence of grading using the above image is such: + +The grading engine: scp top level Makefile, autograde.tar (both made by course staff) +and student's submission to the grading vm. + +The grading engine: ssh to run autodriver program. + +The greating vm: autodriver program (running as root because of the setuid bit) starts +a child process (running as user autograde) to run "make" with top level Makefile. + +The grading engine: scp the output file from the grading vm. + + From 7a18dbc57d922f24c4ed06b5ae6281d3101fd049 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 3 Sep 2019 13:11:44 -0400 Subject: [PATCH 116/131] time.localtime().tm_isdst gives the correct answer to day light saving. --- tango.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tango.py b/tango.py index ef7d300b..418c372f 100755 --- a/tango.py +++ b/tango.py @@ -59,7 +59,7 @@ def __init__(self): format="%(levelname)s|%(asctime)s|%(name)s|%(message)s", level=Config.LOGLEVEL, ) - + vmms = None if Config.VMMS_NAME == "tashiSSH": from vmms.tashiSSH import TashiSSH @@ -81,7 +81,7 @@ def __init__(self): # memory between processes. 
Otherwise, JobManager will # be initiated separately JobManager(self.jobQueue).start() - + self.start_time = time.time() self.log = logging.getLogger("TangoServer") self.log.info("Starting Tango server") @@ -204,11 +204,11 @@ def getInfo(self): stats['runjob_errors'] = Config.runjob_errors stats['copyout_errors'] = Config.copyout_errors stats['num_threads'] = threading.activeCount() - isdst = (time.struct_time.tm_isdst == 1) + isdst = (time.localtime().tm_isdst > 0) stats['timezone_offset'] = time.altzone if isdst else time.timezone (zone, daylight) = time.tzname stats['timezone_name'] = zone + ("" if not daylight else ("/" + daylight)) - + return stats # @@ -310,7 +310,7 @@ def __validateJob(self, job, vmms): imgList = vobj.getImages() if job.vm.image not in imgList: self.log.error("validateJob: Image not found: %s" % job.vm.image) - + job.appendTrace("validateJob: Image not found: %s" % job.vm.image) errors += 1 @@ -361,7 +361,7 @@ def __validateJob(self, job, vmms): if not hasMakefile: self.log.error("validateJob: Missing Makefile in input files.") job.appendTrace("validateJob: Missing Makefile in input files.") - errors+=1 + errors+=1 # Check if job timeout has been set; If not set timeout to default if not job.timeout or job.timeout <= 0: From 48568e9b085beaadf2ad7d897246aba7bbf47c42 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 3 Sep 2019 13:16:38 -0400 Subject: [PATCH 117/131] Deal with exception inside exception handler of initializeVM. Or all Tango jobs will be stuck in such cases. 
--- vmms/ec2SSH.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 3cf12bdf..1c6668dc 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -294,9 +294,13 @@ def initializeVM(self, vm): return vm except Exception as e: - self.log.error("initializeVM Failed: %s" % e) + self.log.error("initializeVM Failed for vm %s: %s" % (vm.name, e)) if newInstance: - self.boto3resource.instances.filter(InstanceIds=[newInstance.id]).terminate() + try: + self.boto3resource.instances.filter(InstanceIds=[newInstance.id]).terminate() + except Exception as e: + self.log.error("Exception handling failed for %s: %s" % (vm.name, e)) + return None return None def waitVM(self, vm, max_secs): From 0b1b3449b41491255a44ea6411cd1b52723065dc Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 3 Sep 2019 16:54:54 -0400 Subject: [PATCH 118/131] Clean existing output files for those student jobs selected to run. --- tools/run_jobs.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/run_jobs.py b/tools/run_jobs.py index 97881110..f35a826c 100644 --- a/tools/run_jobs.py +++ b/tools/run_jobs.py @@ -53,7 +53,7 @@ email = matchObj.group(1) versionStr = matchObj.group(2) version = int(versionStr) - + withoutSuffix = baseName.replace(lab.handinSuffix, "") outputFile = withoutSuffix + "_" + lab.name + ".txt" jobName = lab.courseLab + "_" + withoutSuffix @@ -181,12 +181,22 @@ cmd.upload(lab, lab.makefile) cmd.upload(lab, lab.autogradeTar) + # before sending the jobs, clean the existing ouput files. + # also collect the files locations for output file waiting. 
+ for i in studentIndexList: + outputFile = lab.outputDir + "/" + student2file[students[i]]["output"] + outputFiles.append(outputFile) + try: + os.remove(outputFile) + print "# Delete existing output file:", outputFile + except OSError: + pass + # load and run student submission for i in studentIndexList: print ("\n# Submit %s for lab %s" % (students[i], lab.name)) cmd.upload(lab, student2file[students[i]]["full"]) cmd.addJob(lab, student2file[students[i]]) - outputFiles.append(lab.outputDir + "/" + student2file[students[i]]["output"]) # end of main loop "cmdLine.args.indecies" if cmdLine.args.dry_run: From a49c447b55675c3f73a275cd47fe16373f37376e Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 9 Sep 2019 10:49:17 -0400 Subject: [PATCH 119/131] Commit changes to build/admin files. --- Dockerfile | 41 +++++++++++++----------------- deployment/config/supervisord.conf | 9 ++++--- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/Dockerfile b/Dockerfile index f2f3c4eb..8ef850b9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,26 +1,19 @@ # Start with empty ubuntu machine -FROM ubuntu:15.04 +FROM ubuntu MAINTAINER Autolab Development Team "autolab-dev@andrew.cmu.edu" # Setup correct environment variable ENV HOME /root -# Change to working directory -WORKDIR /opt - -# Move all code into Tango directory -ADD . 
TangoService/Tango/ -WORKDIR /opt/TangoService/Tango -RUN mkdir volumes - -WORKDIR /opt +RUN mkdir -p /opt/TangoFiles/volumes /opt/TangoFiles/courselabs /opt/TangoFiles/output # Install dependancies RUN apt-get update && apt-get install -y \ nginx \ curl \ git \ + iputils-ping \ vim \ supervisor \ python-pip \ @@ -28,7 +21,7 @@ RUN apt-get update && apt-get install -y \ build-essential \ tcl8.5 \ wget \ - libgcrypt11-dev \ + libgcrypt11-dev \ zlib1g-dev \ apt-transport-https \ ca-certificates \ @@ -38,13 +31,10 @@ RUN apt-get update && apt-get install -y \ && rm -rf /var/lib/apt/lists/* # Install Redis +WORKDIR /opt RUN wget http://download.redis.io/releases/redis-stable.tar.gz && tar xzf redis-stable.tar.gz WORKDIR /opt/redis-stable -RUN make && make install -WORKDIR /opt/TangoService/Tango/ - -# Install Docker from Docker Inc. repositories. -RUN curl -sSL https://get.docker.com/ | sh +RUN make && make install # Install the magic wrapper. ADD ./wrapdocker /usr/local/bin/wrapdocker @@ -53,23 +43,28 @@ RUN chmod +x /usr/local/bin/wrapdocker # Define additional metadata for our image. VOLUME /var/lib/docker -# Create virtualenv to link dependancies +# Install python dependancies +ADD ./requirements.txt /opt/TangoFiles/requirements.txt +WORKDIR /opt/TangoFiles RUN pip install virtualenv && virtualenv . 
-# Install python dependancies RUN pip install -r requirements.txt RUN mkdir -p /var/log/docker /var/log/supervisor # Move custom config file to proper location -RUN cp /opt/TangoService/Tango/deployment/config/nginx.conf /etc/nginx/nginx.conf -RUN cp /opt/TangoService/Tango/deployment/config/supervisord.conf /etc/supervisor/supervisord.conf -RUN cp /opt/TangoService/Tango/deployment/config/redis.conf /etc/redis.conf +ADD ./deployment/config/nginx.conf /etc/nginx/nginx.conf +ADD ./deployment/config/supervisord.conf /etc/supervisor/supervisord.conf +ADD ./deployment/config/redis.conf /etc/redis.conf + +#JMB added for EC2 config +ADD ./deployment/config/boto.cfg /etc/boto.cfg +ADD ./deployment/config/746-autograde.pem /root/746-autograde.pem +RUN chmod 600 /root/746-autograde.pem # Reload new config scripts CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"] - -# TODO: +# TODO: # volumes dir in root dir, supervisor only starts after calling start once , nginx also needs to be started # Different log numbers for two different tangos # what from nginx forwards requests to tango diff --git a/deployment/config/supervisord.conf b/deployment/config/supervisord.conf index f06b47f9..cc50e3b2 100644 --- a/deployment/config/supervisord.conf +++ b/deployment/config/supervisord.conf @@ -45,8 +45,8 @@ priority=1 autostart=true autorestart=false user=root -stdout_logfile=/var/log/redis_stdout.log -stderr_logfile=/var/log/redis_stderr.log +stdout_logfile=/var/log/tango/redis_stdout.log +stderr_logfile=/var/log/tango/redis_stderr.log [program:nginx] command=/usr/sbin/nginx -c /etc/nginx/nginx.conf @@ -70,11 +70,12 @@ command=/bin/bash -c 'sleep 5 && python /opt/TangoService/Tango/restful-tango/se autostart=true process_name=%(process_num)01d redirect_stderr=true -stdout_logfile=/opt/TangoService/tango_log.log.%(process_num)01d +stdout_logfile=/var/log/tango/restful-tango.%(process_num)01d.log numprocs=2 [program:tangoJobManager] command=/bin/bash -c 'sleep 5 && 
python /opt/TangoService/Tango/jobManager.py' autostart=true +autorestart=true redirect_stderr=true -stdout_logfile=/opt/TangoService/tango_job_manager_log.log +stdout_logfile=/var/log/tango/jobManager.log From 42fb51b9e6831c7e68b6dad8445fea79370f0133 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 9 Sep 2019 11:54:56 -0400 Subject: [PATCH 120/131] remove the unusual redis port mapping from run_jobs config file. --- tools/config_for_run_jobs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index 6453dd94..c7cb40cd 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -11,7 +11,7 @@ class Config: # YOUR course name course = "your-name-experiment" course = "czang-exp" - + # YOUR root dir for course/lab definitions and handin (student submissions) courseRoot = "/mnt/data/f16/" @@ -32,7 +32,7 @@ class Config: totalStudents = 1 # number of students to submit firstStudentNum = None # set to None for all students - + # YOUR Tango container's root dir for submissions and output tangoFileRoot = "/root/autolab-oneclick/server/tango_courselabs" @@ -64,6 +64,5 @@ class Config: # Note: This variable is used by tools/ec2Read.py only so far. 
redisHostPort = 6379 # default - redisHostPort = 6380 # end of class Config From ae2799f4d8a750a688a0989545d65292a837a7ee Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Mon, 9 Sep 2019 12:46:32 -0400 Subject: [PATCH 121/131] Make redis port available outside the container --- deployment/config/redis.conf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deployment/config/redis.conf b/deployment/config/redis.conf index 6c765691..178688b0 100644 --- a/deployment/config/redis.conf +++ b/deployment/config/redis.conf @@ -32,6 +32,9 @@ ################################ GENERAL ##################################### +### JMB - allow access from outside the container (also added port in docker-compose.yml) +protected-mode no + # By default Redis does not run as a daemon. Use 'yes' if you need it. # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. daemonize no From 4e6c3451099f8ca87cdc9173f408f9de9e4eb215 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Tue, 8 Oct 2019 13:01:03 -0400 Subject: [PATCH 122/131] Swap the order of wating from pending to running and instance tagging. 
--- vmms/ec2SSH.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 1c6668dc..6f40bd3b 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -243,13 +243,7 @@ def initializeVM(self, vm): time.sleep(config.Config.TIMER_POLL_INTERVAL) newInstance = reservation[0] - if newInstance: - # Assign name to EC2 instance - self.boto3resource.create_tags(Resources=[newInstance.id], - Tags=[{"Key": "Name", "Value": vm.name}]) - self.log.info("new instance %s created with name tag %s" % - (newInstance.id, vm.name)) - else: + if not newInstance: raise ValueError("cannot find new instance for %s" % vm.name) # Wait for instance to reach 'running' state @@ -258,27 +252,33 @@ def initializeVM(self, vm): # Note: You'd think we should be able to read the state from the # instance but that turns out not working. So we round up all # running intances and find our instance by instance id - + filters=[{'Name': 'instance-state-name', 'Values': ['running']}] instances = self.boto3resource.instances.filter(Filters=filters) instanceRunning = False newInstance.load() # reload the state of the instance for inst in instances.filter(InstanceIds=[newInstance.id]): - self.log.debug("VM %s: is running %s" % (vm.name, newInstance.id)) + self.log.debug("VM %s %s: is running" % (vm.name, newInstance.id)) instanceRunning = True if instanceRunning: break if time.time() - start_time > config.Config.INITIALIZEVM_TIMEOUT: - raise ValueError("VM %s: timeout (%d seconds) before reaching 'running' state" % - (vm.name, config.Config.TIMER_POLL_INTERVAL)) + raise ValueError("VM %s %s: timeout (%d seconds) before reaching 'running' state" % + (vm.name, newInstance.id, config.Config.TIMER_POLL_INTERVAL)) - self.log.debug("VM %s: Waiting to reach 'running' from 'pending'" % vm.name) + self.log.debug("VM %s %s: Waiting to reach 'running' from 'pending'" % (vm.name, newInstance.id)) time.sleep(config.Config.TIMER_POLL_INTERVAL) # end of 
while loop + # tag the instance + self.boto3resource.create_tags(Resources=[newInstance.id], + Tags=[{"Key": "Name", "Value": vm.name}]) + self.log.info("new instance %s created with name tag %s" % + (newInstance.id, vm.name)) + self.log.info( "VM %s | State %s | Reservation %s | Public DNS Name %s | Public IP Address %s" % (vm.name, From defb37f53c38586e3025c5ef592d1d53f9804174 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 9 Oct 2019 13:26:49 -0400 Subject: [PATCH 123/131] Add timed cleanup for untagged stale vms. Add tests --- tools/ec2Read.py | 43 +++++++++++++++++++++++++---- vmms/ec2SSH.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 6 deletions(-) diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 592e5eb3..24544439 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -24,6 +24,8 @@ def __init__(self): help="aws access id, key and user, space separated") parser.add_argument('-c', '--createVMs', action='store_true', dest='createVMs', help="add a VM for each pool") + parser.add_argument('-C', '--createInstance', action='store_true', + dest='createInstance', help="create an instance without adding to a pool") parser.add_argument('-d', '--destroyVMs', action='store_true', dest='destroyVMs', help="destroy VMs and empty pools") parser.add_argument('-D', '--instanceNameTags', nargs='+', @@ -40,18 +42,19 @@ def __init__(self): argListAllInstances = cmdLine.args.listInstances argDestroyVMs = cmdLine.args.destroyVMs argCreateVMs = cmdLine.args.createVMs +argCreateInstance = cmdLine.args.createInstance argAccessIdKeyUser = cmdLine.args.accessIdKeyUser def destroyVMs(): vms = ec2.getVMs() print "number of Tango VMs:", len(vms) for vm in vms: - if vm.id: + if vm.id: print "destroy", nameToPrint(vm.name) ec2.destroyVM(vm) else: print "VM not in Tango naming pattern:", nameToPrint(vm.name) - + def pingVMs(): vms = ec2.getVMs() print "number of Tango VMs:", len(vms) @@ -66,7 +69,7 @@ def pingVMs(): else: 
print "VM not in Tango naming pattern:", nameToPrint(vm.name) -local_tz = pytz.timezone("EST") +local_tz = pytz.timezone(Config.AUTODRIVER_LOGGING_TIME_ZONE) def utc_to_local(utc_dt): local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) return local_dt.strftime("%Y%m%d-%H:%M:%S") @@ -272,6 +275,37 @@ def destroyRedisPools(): listPools() exit() +# Create number of instances (no pool), some of them without name tag +# to test untagged stale machine cleanup ability in Tango. +# watch tango.log for the cleanup actions. +if argCreateInstance: + i = 0 + while True: + vm = TangoMachine(vmms="ec2SSH") + vm.id = int(datetime.datetime.utcnow().strftime('%s')) + vm.image = '746' + vm.pool = '746' + vm.name = ec2.instanceName(vm.id, vm.pool) + result = ec2.initializeVM(vm) + if result: + print "created: ", result.name, result.instance_id + else: + print "failed to create" + break + + # delete name tage for half of instances + if i % 2 == 0: + boto3connection.delete_tags(Resources=[result.instance_id], + Tags=[{"Key": "Name"}]) + i += 1 + time.sleep(30) + + if i > 20: + break + + time.sleep(10000) + exit() + # ec2WithKey can be used to test the case that tango_cli uses # non-default aws access id and key if argAccessIdKeyUser: @@ -291,6 +325,3 @@ def destroyRedisPools(): listInstances() # Write combination of ops not provided by the command line options here: - - - diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 6f40bd3b..6051e473 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -8,6 +8,9 @@ import re import time import logging +import datetime +from threading import Timer +import pytz import config from tangoObjects import TangoMachine @@ -128,6 +131,12 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): self.log.info("Ignored images %s for lack of or ill-formed name tag" % str(ignoredAMIs)) + # start a timer to cleanup stale vms + t = Timer(60, self.cleanupUntaggedStaleVMs) + t.daemon = True # timer thread will not hold off process 
termination + t.start() + # end of __init__ + # # VMMS helper methods # @@ -551,3 +560,65 @@ def getImages(self): """ self.log.info("getImages: %s" % str(list(self.img2ami.keys()))) return list(self.img2ami.keys()) + + def cleanupUntaggedStaleVMs(self): + self.log.info("cleanupUntaggedStaleVMs") + + nameAndInstances = [] + filters=[{'Name': 'instance-state-name', 'Values': ['running', 'pending']}] + instanceType = 'running or pending' + + instances = self.boto3resource.instances.filter(Filters=filters) + for instance in self.boto3resource.instances.filter(Filters=filters): + launchTime = instance.launch_time.ctime() + tmp = instance.launch_time + tmp1 = datetime.datetime.utcnow() + age = int((tmp1.replace(tzinfo=pytz.utc) - tmp.replace(tzinfo=pytz.utc)).total_seconds()) + nameAndInstances.append({"Name": self.getTag(instance.tags, "Name"), + "launchTime": launchTime, + "age": age, + "Instance": instance}) + self.log.info("number of running/pending instances: %d" % len(nameAndInstances)) + + nameNone = [] + named = [] + for item in nameAndInstances: + if item["Name"]: + named.append(item) + else: + nameNone.append(item) + + staleSet = [] + nameNone.sort(key=lambda x: x["age"], reverse=True) # oldest first + for item in nameNone: + instance = item["Instance"] + stale = "" + if item["age"] > config.Config.INITIALIZEVM_TIMEOUT * 2: # multiply 2 to be conservative + staleSet.append(item) + stale = "(STALE)" + self.log.info("[%s]: %s, age: %s, launch time: %s, state: %s %s" % + (item["Name"], instance.id, item["age"], + item["launchTime"], instance.state["Name"], stale)) + + named.sort(key=lambda x: x["Name"]) + for item in named: + instance = item["Instance"] + self.log.info("[%s]: %s, age: %s, launch time: %s, state: %s" % + (item["Name"], instance.id, item["age"], + item["launchTime"], instance.state["Name"])) + + # Delete VMs. 
Note that we don't do anything to the pools because + # untagged VMs can't enter a pool + for item in staleSet: + instance = item["Instance"] + vm = TangoMachine(vmms="ec2SSH") + vm.instance_id = instance.id + vm.name = None + self.log.info("cleanup untagged stale instance %s, age: %s, launch time: %s" % + (instance.id, item["age"], item["launchTime"])) + self.destroyVM(vm) + + t = Timer(60, self.cleanupUntaggedStaleVMs) + t.daemon = True # timer thread will not hold off process termination + t.start() + # end of cleanupUntaggedStaleVMs() From 3d60ae0e8c3d91ad7481d1bcba41626e51cf4680 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 9 Oct 2019 13:54:41 -0400 Subject: [PATCH 124/131] Use proper timezone to log instance launch time. --- vmms/ec2SSH.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 6051e473..cb4d3543 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -85,6 +85,7 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): VM created """ + self.local_tz = pytz.timezone(config.Config.AUTODRIVER_LOGGING_TIME_ZONE) self.log = logging.getLogger("Ec2SSH-" + str(os.getpid())) self.log.info("init Ec2SSH") @@ -570,10 +571,11 @@ def cleanupUntaggedStaleVMs(self): instances = self.boto3resource.instances.filter(Filters=filters) for instance in self.boto3resource.instances.filter(Filters=filters): - launchTime = instance.launch_time.ctime() - tmp = instance.launch_time - tmp1 = datetime.datetime.utcnow() - age = int((tmp1.replace(tzinfo=pytz.utc) - tmp.replace(tzinfo=pytz.utc)).total_seconds()) + creationTime = instance.launch_time + localCreationTime = creationTime.replace(tzinfo=pytz.utc).astimezone(self.local_tz) + launchTime = localCreationTime.strftime("%Y%m%d-%H:%M:%S") + nowTime = datetime.datetime.utcnow() + age = int((nowTime.replace(tzinfo=pytz.utc) - creationTime.replace(tzinfo=pytz.utc)).total_seconds()) nameAndInstances.append({"Name": 
self.getTag(instance.tags, "Name"), "launchTime": launchTime, "age": age, From 50f31a5bdefa4864054a08466a29fd08849eb632 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 9 Oct 2019 14:07:30 -0400 Subject: [PATCH 125/131] Add exception handling to cleanup function. --- vmms/ec2SSH.py | 105 ++++++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index cb4d3543..4785a976 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -569,57 +569,62 @@ def cleanupUntaggedStaleVMs(self): filters=[{'Name': 'instance-state-name', 'Values': ['running', 'pending']}] instanceType = 'running or pending' - instances = self.boto3resource.instances.filter(Filters=filters) - for instance in self.boto3resource.instances.filter(Filters=filters): - creationTime = instance.launch_time - localCreationTime = creationTime.replace(tzinfo=pytz.utc).astimezone(self.local_tz) - launchTime = localCreationTime.strftime("%Y%m%d-%H:%M:%S") - nowTime = datetime.datetime.utcnow() - age = int((nowTime.replace(tzinfo=pytz.utc) - creationTime.replace(tzinfo=pytz.utc)).total_seconds()) - nameAndInstances.append({"Name": self.getTag(instance.tags, "Name"), - "launchTime": launchTime, - "age": age, - "Instance": instance}) - self.log.info("number of running/pending instances: %d" % len(nameAndInstances)) - - nameNone = [] - named = [] - for item in nameAndInstances: - if item["Name"]: - named.append(item) - else: - nameNone.append(item) - - staleSet = [] - nameNone.sort(key=lambda x: x["age"], reverse=True) # oldest first - for item in nameNone: - instance = item["Instance"] - stale = "" - if item["age"] > config.Config.INITIALIZEVM_TIMEOUT * 2: # multiply 2 to be conservative - staleSet.append(item) - stale = "(STALE)" - self.log.info("[%s]: %s, age: %s, launch time: %s, state: %s %s" % - (item["Name"], instance.id, item["age"], - item["launchTime"], instance.state["Name"], stale)) - - named.sort(key=lambda x: 
x["Name"]) - for item in named: - instance = item["Instance"] - self.log.info("[%s]: %s, age: %s, launch time: %s, state: %s" % - (item["Name"], instance.id, item["age"], - item["launchTime"], instance.state["Name"])) - - # Delete VMs. Note that we don't do anything to the pools because - # untagged VMs can't enter a pool - for item in staleSet: - instance = item["Instance"] - vm = TangoMachine(vmms="ec2SSH") - vm.instance_id = instance.id - vm.name = None - self.log.info("cleanup untagged stale instance %s, age: %s, launch time: %s" % - (instance.id, item["age"], item["launchTime"])) - self.destroyVM(vm) + try: + instances = self.boto3resource.instances.filter(Filters=filters) + for instance in self.boto3resource.instances.filter(Filters=filters): + creationTime = instance.launch_time + localCreationTime = creationTime.replace(tzinfo=pytz.utc).astimezone(self.local_tz) + launchTime = localCreationTime.strftime("%Y%m%d-%H:%M:%S") + nowTime = datetime.datetime.utcnow() + age = int((nowTime.replace(tzinfo=pytz.utc) - creationTime.replace(tzinfo=pytz.utc)).total_seconds()) + nameAndInstances.append({"Name": self.getTag(instance.tags, "Name"), + "launchTime": launchTime, + "age": age, + "Instance": instance}) + self.log.info("number of running/pending instances: %d" % len(nameAndInstances)) + + nameNone = [] + named = [] + for item in nameAndInstances: + if item["Name"]: + named.append(item) + else: + nameNone.append(item) + + staleSet = [] + nameNone.sort(key=lambda x: x["age"], reverse=True) # oldest first + for item in nameNone: + instance = item["Instance"] + stale = "" + if item["age"] > config.Config.INITIALIZEVM_TIMEOUT * 2: # multiply 2 to be conservative + staleSet.append(item) + stale = "(STALE)" + self.log.info("[%s]: %s, age: %s, launch time: %s, state: %s %s" % + (item["Name"], instance.id, item["age"], + item["launchTime"], instance.state["Name"], stale)) + + named.sort(key=lambda x: x["Name"]) + for item in named: + instance = item["Instance"] + 
self.log.info("[%s]: %s, age: %s, launch time: %s, state: %s" % + (item["Name"], instance.id, item["age"], + item["launchTime"], instance.state["Name"])) + + # Delete VMs. Note that we do nothing to the pools because + # untagged VMs can't enter a pool + for item in staleSet: + instance = item["Instance"] + vm = TangoMachine(vmms="ec2SSH") + vm.instance_id = instance.id + vm.name = None + self.log.info("cleanup untagged stale instance %s, age: %s, launch time: %s" % + (instance.id, item["age"], item["launchTime"])) + self.destroyVM(vm) + + except Exception as e: + self.log.debug("cleanupUntaggedStaleVMs exception: %s" % e) + # set the next time interval t = Timer(60, self.cleanupUntaggedStaleVMs) t.daemon = True # timer thread will not hold off process termination t.start() From 870757f5ab1cf90ebf4d485e0ff3c52b1174abfd Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 9 Oct 2019 14:15:15 -0400 Subject: [PATCH 126/131] Add python pytz package. --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 8ef850b9..f0b989b6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,6 +48,7 @@ ADD ./requirements.txt /opt/TangoFiles/requirements.txt WORKDIR /opt/TangoFiles RUN pip install virtualenv && virtualenv . RUN pip install -r requirements.txt +RUN pip install pytz RUN mkdir -p /var/log/docker /var/log/supervisor From fc2dec63928a53980c09a70f85739e67bf7c7ae2 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 9 Oct 2019 16:42:11 -0400 Subject: [PATCH 127/131] Manually start the cleanup function from test script. --- tools/config_for_run_jobs.py | 9 ++------- tools/ec2Read.py | 6 ++++++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/config_for_run_jobs.py b/tools/config_for_run_jobs.py index c7cb40cd..80b86f76 100644 --- a/tools/config_for_run_jobs.py +++ b/tools/config_for_run_jobs.py @@ -17,13 +17,8 @@ class Config: # YOUR lab definitions. 
The index of the lab is given to run_job.py labs = [ - {"name": "cloudfscheckpoint2dedup", "handinSuffix": ".tar", "image": "penndot.img"}, - {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "penndot.img"}, - {"name": "myftlcheckpoint2", "handinSuffix": ".cpp", "image": "746.img"}, - {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "746.img"}, - {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "xyz.img"}, - {"name": "myftlcheckpoint3", "handinSuffix": ".cpp", "image": "xyz.img"}, - {"name": "cloudfscheckpoint1fuse", "handinSuffix": ".tar", "image": "xyz.img"}] + {"name": "cloudfscheckpoint2dedup", "handinSuffix": ".tar", "image": "746"}, + {"name": "myftlcheckpoint1", "handinSuffix": ".cpp", "image": "746"}] # Range of student submissions to run (sorted by student emails) # If either is None, all student submissions are run, unless diff --git a/tools/ec2Read.py b/tools/ec2Read.py index 24544439..ed390923 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -279,6 +279,12 @@ def destroyRedisPools(): # to test untagged stale machine cleanup ability in Tango. # watch tango.log for the cleanup actions. if argCreateInstance: + # The cleanup function is not active unless the application is + # jobManager. Therefore we start it manually here. + if hasattr(ec2, 'setTimer4cleanup'): + print "start setTimer4cleanup function in vmms" + ec2.setTimer4cleanup() + i = 0 while True: vm = TangoMachine(vmms="ec2SSH") From 0ba4baefe839f11a1a8b456ffa50396a2ea70ea0 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Wed, 9 Oct 2019 16:43:03 -0400 Subject: [PATCH 128/131] ec2SSH is activated from multiple tango services, probably unnecessarily. But the timed cleanup shouldn't run in each of them. Now it only runs in jobManager. 
--- vmms/ec2SSH.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vmms/ec2SSH.py b/vmms/ec2SSH.py index 4785a976..bf86838d 100644 --- a/vmms/ec2SSH.py +++ b/vmms/ec2SSH.py @@ -3,6 +3,7 @@ # # ssh and scp to access them. +import __main__ import subprocess import os import re @@ -85,9 +86,10 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): VM created """ + self.appName = os.path.basename(__main__.__file__).strip(".py") self.local_tz = pytz.timezone(config.Config.AUTODRIVER_LOGGING_TIME_ZONE) self.log = logging.getLogger("Ec2SSH-" + str(os.getpid())) - self.log.info("init Ec2SSH") + self.log.info("init Ec2SSH in program %s" % self.appName) self.ssh_flags = Ec2SSH._SSH_FLAGS self.ec2User = ec2User if ec2User else config.Config.EC2_USER_NAME @@ -132,10 +134,8 @@ def __init__(self, accessKeyId=None, accessKey=None, ec2User=None): self.log.info("Ignored images %s for lack of or ill-formed name tag" % str(ignoredAMIs)) - # start a timer to cleanup stale vms - t = Timer(60, self.cleanupUntaggedStaleVMs) - t.daemon = True # timer thread will not hold off process termination - t.start() + if self.appName == "jobManager": + self.setTimer4cleanup() # end of __init__ # @@ -562,6 +562,12 @@ def getImages(self): self.log.info("getImages: %s" % str(list(self.img2ami.keys()))) return list(self.img2ami.keys()) + def setTimer4cleanup(self): + # start a timer to cleanup stale vms + t = Timer(60, self.cleanupUntaggedStaleVMs) + t.daemon = True # timer thread will not hold off process termination + t.start() + def cleanupUntaggedStaleVMs(self): self.log.info("cleanupUntaggedStaleVMs") @@ -624,8 +630,5 @@ def cleanupUntaggedStaleVMs(self): except Exception as e: self.log.debug("cleanupUntaggedStaleVMs exception: %s" % e) - # set the next time interval - t = Timer(60, self.cleanupUntaggedStaleVMs) - t.daemon = True # timer thread will not hold off process termination - t.start() + self.setTimer4cleanup() # set the next timer 
interval # end of cleanupUntaggedStaleVMs() From d406a136ca3e001013b2c2ac1e33041047e867e8 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Fri, 18 Oct 2019 08:55:44 -0400 Subject: [PATCH 129/131] Remove tool's reference to redis and close redis's open door --- deployment/config/redis.conf | 2 +- tools/ec2Read.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/deployment/config/redis.conf b/deployment/config/redis.conf index 178688b0..f6992c06 100644 --- a/deployment/config/redis.conf +++ b/deployment/config/redis.conf @@ -33,7 +33,7 @@ ################################ GENERAL ##################################### ### JMB - allow access from outside the container (also added port in docker-compose.yml) -protected-mode no +protected-mode yes # By default Redis does not run as a daemon. Use 'yes' if you need it. # Note that Redis will write a pid file in /var/run/redis.pid when daemonized. diff --git a/tools/ec2Read.py b/tools/ec2Read.py index ed390923..eac13c04 100644 --- a/tools/ec2Read.py +++ b/tools/ec2Read.py @@ -6,9 +6,7 @@ from tangoObjects import TangoMachine from tango import TangoServer from config import Config -import tangoObjects import config_for_run_jobs -import redis import boto3 import pytz import argparse @@ -194,13 +192,6 @@ def destroyRedisPools(): # END of function definitions # -# When a host has two Tango containers (for experiment), there are two -# redis servers, too. They differ by the forwarding port number, which -# is defined in config_for_run_jobs.py. 
To select the redis server, -# We get the connection here and pass it into tangoObjects -redisConnection = redis.StrictRedis( - host=Config.REDIS_HOSTNAME, port=config_for_run_jobs.Config.redisHostPort, db=0) -tangoObjects.getRedisConnection(connection=redisConnection) boto3connection = boto3.client("ec2", Config.EC2_REGION) boto3resource = boto3.resource("ec2", Config.EC2_REGION) From 96c98306458f51de027637ce8c6433fe0afa0665 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Sat, 9 Nov 2019 16:50:17 -0500 Subject: [PATCH 130/131] dump the content of an email in 2018 in a file. --- major_fixes_by_PDL | 89 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 major_fixes_by_PDL diff --git a/major_fixes_by_PDL b/major_fixes_by_PDL new file mode 100644 index 00000000..51eddf45 --- /dev/null +++ b/major_fixes_by_PDL @@ -0,0 +1,89 @@ +Part 1 is a list of the major bugs fixes and improvements (some with +relevant commits), followed by Part 2, a list of new configuration +variables. Note that the commits may not be self-contained because +themselves may be buggy and have follow-up commits. They are here to +help understand the nature of the bugs and enhancements. + +Part 1. Bug fixes and enhancements + +The follow two bugs, combined, prevent pending jobs from being executed: +* When number of jobs is larger than number of vms in free pool, +jobManager dies. +* When jobManager restarts, free pool is not emptied whilst total pool +is, causing inconsistency. 
+https://github.com/xyzisinus/Tango/commit/4dcbbb4dfef096f3e64ef91f3eff4bf9d82b66b6 + +https://github.com/xyzisinus/Tango/commit/e2afe8a7d73bbd633282a35ec71ea690d2bb1db0 + + +* Add ability to specify image name for ec2 using "Name" tag on AMI +(used to allow only one image specified as DEFAULT_AMI): +https://github.com/xyzisinus/Tango/commit/97c22e39bcadf37b784cc2a0db5ea6202a5634ab + +https://github.com/xyzisinus/Tango/commit/e66551a53223b31c3baef74860eb845e4c2adac1 + + +* When job id reaches the max and wraps around, the jobs with larger ids +starve. +https://github.com/xyzisinus/Tango/commit/9565275dab5d0fa614b96b33bad642559f7714a4 + + +* Improve the worker's run() function to report errors on the +copy-in/exec/copy-out path more precisely. +https://github.com/xyzisinus/Tango/commit/caac9b46733716ed30feb62646d750a7accdd4f7 + +https://github.com/xyzisinus/Tango/commit/c47d8891a54f8cccef3ba4abd2938fa49c906dd1 + + +* In the original code, Tango allocates all vm instances allowed by +POOL_SIZE at once. It shouldn't be an issue because once a vm is made +ready a pending job should start using it. However, due to well-known +Python thread scheduling problems, the pending jobs will not run until +all vms are allocated. As we observed, vm allocations are almost +sequential although each allocation runs in a separate thread, again due +to Python's threading. That results in a long delay for the first job +to start running. To get around the problem, POOL_ALLOC_INCREMENT is +added to incrementally allocate vms and allow jobs to start running sooner. +https://github.com/xyzisinus/Tango/commit/93e60ada803514d4164237f5043bee95671259aa + + +* With POOL_SIZE_LOW_WATER_MARK, add the ability to shrink pool size +when there are extra vms in free pool. When low water mark is set to +zero, no vms are kept in free pool and a fresh vm is allocated for every +job and destroyed afterward. 
It is used to maintain desired number of +ec2 machines as standbys in the pool while terminating extra vms to save +money. +https://github.com/xyzisinus/Tango/commit/d896b360f6c8111a6be81df89bd43917519dd581 + +https://github.com/xyzisinus/Tango/commit/780557749cd14c272aad6a7ea4d5e04ff2ac18ed + + +* Improve autodriver with accurate error reporting and optional time +stamp insertion into job output. +Tango/autodriver/autodriver.c + +* When Tango restarts, vms in free pool are preserved (used to be all +destroyed). +https://github.com/xyzisinus/Tango/commit/e2afe8a7d73bbd633282a35ec71ea690d2bb1db0 + + +* Add run_jobs script to submit existing student handins in large numbers: +Tango/tools/run_jobs.py + +* Improve general logging by adding pid in logs and messages at critical +execution points. + +Part 2. New configuration variables (all optional) + +* Passed to autodriver to enhance readability of the output file. +Currently only integrated in ec2 vmms. +AUTODRIVER_LOGGING_TIME_ZONE +AUTODRIVER_TIMESTAMP_INTERVAL + +* Control of the preallocator pool as explained in Part 1. +POOL_SIZE_LOW_WATER_MARK +POOL_ALLOC_INCREMENT + +* Instead of destroying it, set the vm aside for further investigation +after autodriver returns OS ERROR. Currently only integrated in ec2 vmms. +KEEP_VM_AFTER_FAILURE From a0a74345786852edc1841f1af3ecf3abc2bc15a9 Mon Sep 17 00:00:00 2001 From: Xiaolin Charlene Zang Date: Sat, 9 Nov 2019 16:51:52 -0500 Subject: [PATCH 131/131] Add the timestamp in the "major fixes" file. --- major_fixes_by_PDL | 2 ++ 1 file changed, 2 insertions(+) diff --git a/major_fixes_by_PDL b/major_fixes_by_PDL index 51eddf45..ee2aa54d 100644 --- a/major_fixes_by_PDL +++ b/major_fixes_by_PDL @@ -1,3 +1,5 @@ +This the the content of a 2018 email. + Part 1 is a list of the major bugs fixes and improvements (some with relevant commits), followed by Part 2, a list of new configuration variables. Note that the commits may not be self-contained because