diff --git a/sio/sioworkersd/scheduler/__init__.py b/sio/sioworkersd/scheduler/__init__.py
index 370ed27..bb79a6f 100644
--- a/sio/sioworkersd/scheduler/__init__.py
+++ b/sio/sioworkersd/scheduler/__init__.py
@@ -34,6 +34,11 @@ def schedule(self):
         (task_id, worker_id)."""
         raise NotImplementedError()
 
+    def dump(self):
+        """Return a dict representation of the scheduler state"""
+        # use string representation as a fallback
+        return unicode(self)
+
 
 def getDefaultSchedulerClassName():
     return 'sio.sioworkersd.scheduler.prioritizing.PrioritizingScheduler'
diff --git a/sio/sioworkersd/scheduler/prioritizing.py b/sio/sioworkersd/scheduler/prioritizing.py
index 0aca465..972e8e7 100644
--- a/sio/sioworkersd/scheduler/prioritizing.py
+++ b/sio/sioworkersd/scheduler/prioritizing.py
@@ -51,6 +51,9 @@
 
 from sio.sioworkersd.scheduler import Scheduler
 from sio.sioworkersd.utils import get_required_ram_for_job
+from twisted.logger import Logger, LogLevel
+
+log = Logger()
 
 
 class _WaitingTasksQueue(object):
@@ -100,6 +103,12 @@ def popleft(self):
     def getTasksRequiredRam(self):
         return self._tasks_required_ram
 
+    def __repr__(self):
+        return '<_WaitingTasksQueue %r>' % self._dict.keys()
+
+    def __iter__(self):
+        return iter(self._dict.keys())
+
 
 class WorkerInfo(object):
     """A class responsible for tracking state of a single worker.
@@ -202,6 +211,16 @@ def __init__(self, env, contest):
         # Mutable data
         self.assigned_worker = None
 
+    def __repr__(self):
+        return '<TaskInfo %s (%s) prio=%r contest=%r worker=%r>' % (self.id,
+            'realCPU' if self.real_cpu else 'vCPU', self.priority,
+            self.contest, self.assigned_worker)
+
+    def dump(self):
+        return {'id': self.id, 'real_cpu': self.real_cpu,
+                'priority': self.priority, 'contest': self.contest.dump(),
+                'assigned_worker': self.assigned_worker.id if self.assigned_worker else None}
+
 
 class ContestInfo(object):
     """Tracks priority and weight of a contest.
@@ -222,6 +241,14 @@ def __init__(self, contest_uid, priority, weight):
         self.priority = priority
         self.weight = weight
 
+    def __repr__(self):
+        return '<ContestInfo %r prio=%r weight=%r>' % (self.uid,
+            self.priority, self.weight)
+
+    def dump(self):
+        return {'uid': self.uid, 'priority': self.priority,
+                'weight': self.weight}
+
 
 class TasksQueues(object):
     """Per-contest priority queues of tasks.
@@ -298,6 +325,12 @@ def chooseTask(self):
 
         return self.queues[best_contest][-1]
 
+    def __repr__(self):
+        return '<TasksQueues %r>' % self.queues
+
+    def dump(self):
+        return {('%s:%s' % k.uid): [x.dump() for x in v] for k, v in self.queues.items()}
+
 
 class PrioritizingScheduler(Scheduler):
     """The prioritizing scheduler main class, implementing scheduler interface.
@@ -356,6 +389,10 @@ def __unicode__(self):
         """
         return unicode((self.tasks_queues, self.waiting_real_cpu_tasks))
 
+    def dump(self):
+        return {'tasks_queues': {k: v.dump() for k, v in self.tasks_queues.items()},
+                'waiting_real_cpu_tasks': [x.dump() for x in self.waiting_real_cpu_tasks]}
+
     # Worker scheduling
 
     def _insertWorkerToQueue(self, worker):
@@ -370,6 +407,7 @@ def _removeWorkerFromQueue(self, worker):
 
     def addWorker(self, worker_id):
         """Will be called when a new worker appears."""
+        log.warn("addWorker, workers {}".format(len(self.workers)))
         worker = WorkerInfo(worker_id, self.manager.getWorkers()[worker_id])
         self.workers[worker_id] = worker
 
@@ -659,10 +697,22 @@ def schedule(self):
         """Return a list of tasks to be executed now, as a list of pairs
         (task_id, worker_id).
""" + if self.tasks: + log.warn("{} tasks availible, tasks queues:".format(len(self.tasks))) + for q, v in self.tasks_queues.iteritems(): + log.warn(" {} {}".format(q, bool(v))) + log.warn("workers {}, queues:".format(len(self.workers))) + for q, v in self.workers_queues.iteritems(): + log.warn(" {} {}".format(q, len(v))) + result = [] while True: association = self._scheduleOnce() if association is None: break result.append(association) + + if result: + log.warn("{} tasks scheduled".format(len(result))) + return result diff --git a/sio/sioworkersd/siorpc.py b/sio/sioworkersd/siorpc.py index 947303a..e74abeb 100644 --- a/sio/sioworkersd/siorpc.py +++ b/sio/sioworkersd/siorpc.py @@ -44,6 +44,9 @@ def xmlrpc_get_workers(self): def xmlrpc_get_queue(self): return self.taskm.getQueue() + def xmlrpc_get_tasks(self): + return self.taskm.getTasks() + def _prepare_group(self, env): tasks = env['workers_jobs'] group_id = 'GROUP_' + uuid4().urn diff --git a/sio/sioworkersd/taskmanager.py b/sio/sioworkersd/taskmanager.py index b0b4d65..9f7182f 100644 --- a/sio/sioworkersd/taskmanager.py +++ b/sio/sioworkersd/taskmanager.py @@ -36,7 +36,7 @@ def __init__(self, desc, excs): s = desc + '\n\n' l = [] for (e, tb) in excs: - l.append("Exception: %s\n%s" % (str(e), tb)) + l.append((u"Exception: %s\n%s" % (e, tb)).encode('utf-8')) s += ('='*80 + '\n').join(l) super(MultiException, self).__init__(s) @@ -62,12 +62,23 @@ def restart_db_sync_task(failure, task): task=self.db_sync_task) def get_items(self): - return [json.loads(self.db[k]) for k in self.db.keys()] + items = [] + error = [] + for k in self.db.keys(): + try: + items.append(json.loads(self.db[k])) + except: + error.append(k) + log.error("Failed to decode {key}", key=k) + for k in error: + log.error("Removing {key}", key=k) + #del self.db[k] + return items def update(self, job_id, dict_update, sync=True): - job = json.loads(self.db.get(job_id, '{}')) + job = json.loads(self.db.get(str(job_id), '{}')) job.update(dict_update) - self.db[job_id] = json.dumps(job) + self.db[str(job_id)] = json.dumps(job) if sync: self.db.sync() @@ -100,18 +111,33 @@ def startService(self): if len(all_jobs) > 0: log.info("Unfinished jobs found in database, resuming them...") + return_old_task_concurrency = 16 + jobs_to_return = [ [] for _ in range(return_old_task_concurrency) ] + j = 0 + for job in all_jobs: if job['status'] == 'to_judge': d = self._addGroup(job['env']) log.debug("added again unfinished task {tid}", tid=job['id']) d.addBoth(self.returnToSio, url=job['env']['return_url'], - orig_env=job['env'], tid=job['id']) + orig_env=job['env'], tid=str(job['id'])) elif job['status'] == 'to_return': - log.warn("Trying again to return old task {tid}", - tid=job['id']) - self.returnToSio(job['env'], url=job['env']['return_url'], - orig_env=job['env'], tid=job['id'], - count=job['retry_cnt']) + jobs_to_return[j].append(job) + j = (j + 1) % return_old_task_concurrency + + for i in range(return_old_task_concurrency): + log.warn("Returning {n} tasks", n=len(jobs_to_return[i])) + def return_old_task(x, i, jobs): + if len(jobs) != 0: + job = jobs.pop() + log.warn("Trying again to return old task {tid} from {qid}", + tid=job['id'], qid=i) + d = self.returnToSio(job['env'], url=job['env']['return_url'], + orig_env=job['env'], tid=str(job['id']), + count=job['retry_cnt']) + d.addBoth(return_old_task, i=i, jobs=jobs) + return_old_task(None, i=i, jobs=jobs_to_return[i]) + self.workerm.notifyOnNewWorker(self._newWorker) self.workerm.notifyOnLostWorker(self._lostWorker) 
         self._tryExecute()
@@ -129,6 +155,9 @@ def _tryExecute(self, x=None):
         # a performance problem for complex schedulers, especially during
         # rejudges. A solution exists, but it is a bit complex.
         jobs = self.scheduler.schedule()
+        if len(jobs) > 0:
+            log.warn("jobs: {}, inProgress: {}".format(len(jobs), len(self.inProgress)))
+
         for (task_id, worker) in jobs:
             task = self.inProgress[task_id]
             d = self.workerm.runOnWorker(worker, task.env)
@@ -152,6 +181,7 @@ def _retry_on_disconnect(failure, task_id=task_id, task=task):
         return x
 
     def _taskDone(self, x, tid):
+        tid = str(tid)
         if isinstance(x, Failure):
             self.inProgress[tid].env['error'] = {
                 'message': x.getErrorMessage(),
@@ -179,16 +209,21 @@ def _deferTask(self, env):
         tid = env['task_id']
+        tid = str(tid)
         if tid in self.inProgress:
             raise RuntimeError('Tried to add same task twice')
         d = defer.Deferred()
+        log.warn("adding task, inProgress {}".format(len(self.inProgress)))
         self.inProgress[tid] = Task(env=env, d=d)
         d.addBoth(self._taskDone, tid=tid)
         return d
 
     def getQueue(self):
-        return unicode(self.scheduler)
+        return self.scheduler.dump()
+
+    def getTasks(self):
+        return {k: t.env for k, t in self.inProgress.iteritems()}
 
     def _addGroup(self, group_env):
         singleTasks = []
@@ -330,7 +365,7 @@ def retry(err, retry_cnt):
         return ret
 
     def _returnDone(self, _, tid):
-        self.database.delete(tid, sync=False)
+        self.database.delete(str(tid), sync=False)
         # No db sync here, because we are allowing some jobs to be done
         # multiple times in case of server failure for better performance.
         # It should be synced soon with other task
diff --git a/sio/sioworkersd/utils.py b/sio/sioworkersd/utils.py
index 6c60a63..e993403 100644
--- a/sio/sioworkersd/utils.py
+++ b/sio/sioworkersd/utils.py
@@ -7,7 +7,7 @@
     'inwer': 256 * 1024,
     'compile': 512 * 1024,
     'exec': 64 * 1024,
-    'checker': 256 * 1024,
+    'checker': 268 * 1024,
     'default': 256 * 1024,
 }
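
Note on the new RPC surface: get_queue now returns the dict built by Scheduler.dump() instead of a unicode dump, and the new get_tasks endpoint exposes the environments of all in-progress tasks. A minimal client sketch (Python 2, matching the codebase) follows; the address and port are assumptions, since the location of sioworkersd's XML-RPC listener depends on the deployment:

# Hedged sketch: inspect sioworkersd state over XML-RPC.
# The URL below is hypothetical -- substitute the host/port that the
# sioworkersd RPC interface is actually bound to in your setup.
import pprint
import xmlrpclib

server = xmlrpclib.ServerProxy('http://localhost:7889/')  # assumed address

pprint.pprint(server.get_workers())  # existing endpoint
pprint.pprint(server.get_queue())    # now a dict from Scheduler.dump()
pprint.pprint(server.get_tasks())    # new endpoint: {task_id: env, ...}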
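
The startService() change drains the 'to_return' backlog with bounded concurrency: jobs are dealt round-robin into return_old_task_concurrency buckets, and each bucket is drained sequentially by chaining the next returnToSio call onto the previous Deferred. A standalone sketch of that pattern is below; process_one and the numeric backlog are illustrative stand-ins, not part of the patch:

# Standalone sketch of the bounded-concurrency pattern used in startService():
# N independent Deferred chains, each draining its own bucket sequentially.
from twisted.internet import reactor, task


def process_one(item):
    # Stand-in for self.returnToSio(); just waits a bit and returns the item.
    return task.deferLater(reactor, 0.1, lambda: item)


def drain(_, bucket):
    # Pop one item, process it, and reschedule ourselves when it finishes.
    if bucket:
        d = process_one(bucket.pop())
        d.addBoth(drain, bucket=bucket)


def main():
    concurrency = 4
    backlog = list(range(20))
    # Deal the backlog round-robin into `concurrency` buckets.
    buckets = [backlog[i::concurrency] for i in range(concurrency)]
    for bucket in buckets:
        drain(None, bucket)  # kick off one chain per bucket
    reactor.callLater(3, reactor.stop)


reactor.callWhenRunning(main)
reactor.run()

This keeps at most `concurrency` return calls in flight at once while preserving order within each bucket, instead of firing the whole backlog at the return URL simultaneously.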