|
1 | 1 | #!/usr/bin/env python |
2 | 2 |
|
3 | | -from WMCore.Configuration import loadConfigurationFile |
4 | | -from WMCore.Database.CMSCouch import Database |
5 | | -from string import Template |
| 3 | +""" |
| 4 | +_cmst0_backlog_wma_ |
| 5 | +
|
| 6 | +Look for created jobs by workflow type in the Tier-0. |
| 7 | +If the overall number of created jobs for a workflow type exceeds its configured threshold |
| 8 | +then an alarm is raised. |
| 9 | +
|
| 10 | +Availability metrics are defined as: |
| 11 | +
|
| 12 | +0 - There is backlog |
| 13 | +100 - There is no backlog |
| 14 | +""" |
| 15 | + |
| 16 | +import logging |
| 17 | +import threading |
| 18 | +import sys |
| 19 | +import re |
6 | 20 | import os |
7 | 21 | import time |
8 | 22 |
|
9 | | -# Load/set configuration : |
10 | | -config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"]) |
11 | | -alarmConfigPath = os.path.join(os.environ.get("SLS_CONFIG") or |
12 | | - os.environ["T0_ROOT"], 'etc/SLSAlarmsConfig.py') |
13 | | -alarmConfig = loadConfigurationFile(alarmConfigPath) |
14 | | -xmlFile = getattr(alarmConfig.cmst0_backlog_wma, "xmlFile", None) or "cmst0_backlog_wma.xml" |
15 | | -slsXml = os.path.join(alarmConfig.Settings.xmlDir, xmlFile) |
16 | | - |
17 | | -# Getting what we need from configuration |
18 | | -couchURL = config.JobStateMachine.couchurl |
19 | | -jobsDatabase = config.JobStateMachine.couchDBName + "/jobs" |
20 | | - |
21 | | -jobsDB = Database(jobsDatabase, couchURL) |
22 | | - |
23 | | -# This is the same as (example) : |
24 | | -# http://vocms15.cern.ch:5984/wmagent_jobdump%2Fjobs/_design/JobDump/_view/createdJobsByWorkflowName?group_level=1 |
25 | | -rows = jobsDB.loadView("JobDump", "createdJobsByWorkflowName", {"group_level" : 1}) |
26 | | - |
27 | | -# This will store how much created jobs we have on each type, created means in the system but still not submitted |
28 | | -jobCounts = { |
29 | | - "Express" : 0, |
30 | | - "Repack" : 0, |
31 | | - "PromptReco" : 0 |
32 | | -} |
33 | | - |
34 | | -for row in rows['rows']: |
35 | | - workflow = row['key'][0] |
36 | | - workflowType = workflow.split("_")[0] |
37 | | - createdJobs = row['value'] |
38 | | - |
39 | | - jobCounts[workflowType] += createdJobs |
40 | | - |
41 | | -# Here comes the TH logic, very simple in the beginning |
42 | | -availability = 100 |
43 | | -for workflowType in jobCounts: |
44 | | - limit = getattr(alarmConfig.cmst0_backlog_wma, workflowType) |
45 | | - if jobCounts[workflowType] > limit: |
46 | | - availability = 0 |
47 | | - |
48 | | -interventionInfo = {} |
49 | | -if hasattr(alarmConfig.cmst0_backlog_wma, "Intervention"): |
50 | | - startTime = alarmConfig.cmst0_backlog_wma.Intervention.startTime |
51 | | - duration = alarmConfig.cmst0_backlog_wma.Intervention.duration |
52 | | - message = alarmConfig.cmst0_backlog_wma.Intervention.message |
53 | | - |
54 | | - # Check that the intervention is present or in the future |
55 | | - structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S") |
56 | | - startTimeSeconds = time.mktime(structStartTime) |
57 | | - if (startTimeSeconds + duration * 3600) >= time.time(): |
58 | | - interventionInfo = {'startTime' : startTime, |
59 | | - 'duration' : duration, |
60 | | - 'message' : message} |
61 | | -intervention = "" |
62 | | -if interventionInfo: |
63 | | - inteventionTemplate = """ <interventions> |
64 | | - <intervention start="{startTime}" length="PT{duration}H"> |
65 | | - {message} |
66 | | - </intervention> |
67 | | - </interventions>""" |
68 | | - |
69 | | - intervention = inteventionTemplate.format(**interventionInfo) |
70 | | - |
71 | | -timezone = str(int(-time.timezone / 3600)).zfill(2) |
72 | | -timestamp = time.strftime("%Y-%m-%dT%H:%M:%S+") |
73 | | -timestamp += "%s:00" % timezone |
74 | | -xmlValues = { |
75 | | - "availability" : availability, |
76 | | - "timestamp" : timestamp, |
77 | | - "promptreco_count": jobCounts["PromptReco"], |
78 | | - "express_count" : jobCounts["Express"], |
79 | | - "repack_count" : jobCounts["Repack"], |
80 | | - "intervention" : intervention |
81 | | -} |
82 | | - |
83 | | - |
84 | | -template = Template("""<?xml version="1.0" encoding="utf-8"?> |
85 | | - <serviceupdate xmlns="http://sls.cern.ch/SLS/XML/update"> |
86 | | - <id>cmst0-wma-backlog</id> |
87 | | - <availability>$availability</availability> |
88 | | - <timestamp>$timestamp</timestamp> |
89 | | - <data> |
90 | | - <numericvalue name="promptreco_count" desc="Backlogged PromptReco">$promptreco_count</numericvalue> |
91 | | - <numericvalue name="repack_count" desc="Backlogged Repack">$repack_count</numericvalue> |
92 | | - <numericvalue name="express_count" desc="Backlogged Express">$express_count</numericvalue> |
93 | | - </data> |
94 | | - $intervention |
95 | | - </serviceupdate> """) |
96 | | - |
97 | | -xmlUpdated = template.safe_substitute(xmlValues) |
98 | | -xml = open(slsXml, 'w') |
99 | | -xml.write(xmlUpdated) |
100 | | -xml.close() |
| 23 | +from WMCore.WMInit import connectToDB |
| 24 | +from WMCore.Database.DBFormatter import DBFormatter |
| 25 | +from WMCore.Configuration import loadConfigurationFile |
| 26 | + |
class CreatedJobsDAO(DBFormatter):
    """
    _CreatedJobsDAO_

    DAO that counts, per workflow name, the WMBS jobs which are currently
    in the 'created' state.

    The query walks wmbs_job -> wmbs_jobgroup -> wmbs_subscription ->
    wmbs_workflow and groups the matching jobs by workflow name, exposing
    the columns 'workflow' and 'jobs'.
    """
    sql = """SELECT wmbs_workflow.name as workflow, COUNT(*) as jobs
             FROM wmbs_job
               INNER JOIN wmbs_jobgroup
                 ON wmbs_job.jobgroup = wmbs_jobgroup.id
               INNER JOIN wmbs_subscription
                 ON wmbs_subscription.id = wmbs_jobgroup.subscription
               INNER JOIN wmbs_workflow
                 ON wmbs_workflow.id = wmbs_subscription.workflow
             WHERE wmbs_job.state = (SELECT id FROM wmbs_job_state
                                     WHERE name = 'created')
             GROUP BY wmbs_workflow.name
             """

    def execute(self, conn = None, transaction = False):
        """
        Run the query and hand the raw result to formatDict.

        conn and transaction are passed straight through to
        dbi.processData. The return value is whatever formatDict
        produces (presumably one dictionary per row - confirm against
        DBFormatter).
        """
        rawResult = self.dbi.processData(self.sql, conn = conn,
                                         transaction = transaction)
        return self.formatDict(rawResult)
| 53 | + |
def setup():
    """
    _setup_

    Global initialisation for the script.

    Opens the database connection through connectToDB() and then loads
    etc/SLSAlarmsConfig.py, looked up under $SLS_CONFIG or, when that
    variable is unset or empty, under $T0_ROOT.

    Returns the cmst0_backlog_wma section of that configuration, with the
    global Settings section attached to it as the attribute 'Settings'.
    """
    connectToDB()

    baseDir = os.environ.get("SLS_CONFIG") or os.environ["T0_ROOT"]
    alarmsConfig = loadConfigurationFile(os.path.join(baseDir,
                                                      "etc/SLSAlarmsConfig.py"))

    # Keep only this alarm's section, but carry the shared Settings with it
    alarmSection = alarmsConfig.cmst0_backlog_wma
    sharedSettings = alarmsConfig.Settings
    alarmSection.section_("Settings")
    alarmSection.Settings = sharedSettings
    return alarmSection
| 72 | + |
def countJobs():
    """
    _countJobs_

    Creates the summary of created jobs in the Tier-0
    by workflow type: Express, Repack and PromptReco.

    The workflow type is the part of the workflow name before the first
    underscore. Workflows whose type is not one of the three above are
    skipped with a warning instead of aborting the whole check with a
    KeyError.

    Returns a dictionary mapping each of the three workflow types to the
    number of jobs in 'created' state (0 when there are none).
    """
    myThread = threading.current_thread()
    retrieveInfo = CreatedJobsDAO(logger = logging, dbinterface = myThread.dbi)
    jobCountsByWorkflow = retrieveInfo.execute()

    jobCounts = {
        "Express" : 0,
        "Repack" : 0,
        "PromptReco" : 0
    }
    for workflow in jobCountsByWorkflow:
        workflowType = workflow['workflow'].split("_")[0]
        if workflowType not in jobCounts:
            # No threshold exists for unknown types, so they cannot be
            # evaluated; report them and keep counting the known ones.
            logging.warning("Ignoring workflow %s: unknown workflow type %s",
                            workflow['workflow'], workflowType)
            continue

        jobCounts[workflowType] += workflow['jobs']

    return jobCounts
| 95 | + |
def calculateAvailability(config, jobCountsByType):
    """
    _calculateAvailability_

    Work out the availability figure reported for the service.

    For every workflow type in jobCountsByType the threshold is read from
    the attribute of the same name on config. The result is 0 (backlog)
    when at least one count is strictly above its threshold and
    100 (no backlog) otherwise.
    """
    # Build the full list (no short-circuit) so that every threshold is
    # looked up, even after a first type is found over its limit.
    overLimit = [jobType
                 for jobType, jobCount in jobCountsByType.items()
                 if jobCount > getattr(config, jobType)]
    if overLimit:
        return 0
    return 100
| 111 | + |
def _slsTimestamp():
    """Return the current local time as YYYY-MM-DDTHH:MM:SS plus its UTC offset."""
    now = time.localtime()
    # time.timezone / time.altzone are expressed in seconds WEST of UTC,
    # hence the sign flip. altzone applies while daylight saving is active.
    if time.daylight and now.tm_isdst > 0:
        offsetSeconds = -time.altzone
    else:
        offsetSeconds = -time.timezone
    sign = "-" if offsetSeconds < 0 else "+"
    hours, minutes = divmod(abs(offsetSeconds) // 60, 60)
    return "%s%s%02d:%02d" % (time.strftime("%Y-%m-%dT%H:%M:%S", now),
                              sign, hours, minutes)

def _interventionXML(config):
    """Return the <interventions> fragment for config.Intervention, or "" if none applies."""
    if not hasattr(config, "Intervention"):
        return ""

    startTime = config.Intervention.startTime
    duration = config.Intervention.duration
    message = config.Intervention.message

    # Only report interventions that are ongoing or still in the future;
    # duration is given in hours.
    structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S")
    startTimeSeconds = time.mktime(structStartTime)
    if (startTimeSeconds + duration * 3600) < time.time():
        return ""

    interventionTemplate = """        <interventions>
            <intervention start="{startTime}" length="PT{duration}H">
                {message}
            </intervention>
        </interventions>"""
    return interventionTemplate.format(startTime = startTime,
                                       duration = duration,
                                       message = message)

def buildSLSXML(config, jobCountsByType, availability):
    """
    _buildSLSXML_

    Builds the XML file for the SLS update of this alarm.

    config          - alarm configuration; config.Settings.xmlDir is the
                      output directory, config.xmlFile (optional) the file
                      name (default cmst0_backlog_wma.xml) and
                      config.Intervention (optional) a scheduled intervention
                      with startTime, duration (hours) and message
    jobCountsByType - dictionary with the created job counts for the keys
                      "PromptReco", "Express" and "Repack"
    availability    - availability value to publish

    Writing the file is best effort: a failure is reported on stdout together
    with the traceback and the function returns normally. Returns None.
    """
    # Imported here because the module does not import traceback at the top
    import traceback

    timestamp = _slsTimestamp()
    intervention = _interventionXML(config)

    dataTemplate = """
            <numericvalue name="promptreco_count" desc="Backlogged PromptReco">{promptreco_count}</numericvalue>
            <numericvalue name="repack_count" desc="Backlogged Repack">{repack_count}</numericvalue>
            <numericvalue name="express_count" desc="Backlogged Express">{express_count}</numericvalue>"""
    data = dataTemplate.format(promptreco_count = jobCountsByType["PromptReco"],
                               express_count = jobCountsByType["Express"],
                               repack_count = jobCountsByType["Repack"])

    template = """<?xml version="1.0" encoding="utf-8"?>
    <serviceupdate>
        <id>CMST0-wma-backlog</id>
        <availability>{availability}</availability>
        <timestamp>{timestamp}</timestamp>
        <data>
        {data}
        </data>
{intervention}
    </serviceupdate>\n"""

    xml = template.format(data = data, availability = availability,
                          timestamp = timestamp, intervention = intervention)

    # Get the output file path
    xmlFile = getattr(config, "xmlFile", "cmst0_backlog_wma.xml")
    try:
        # The context manager closes the file only if it was actually opened
        with open(os.path.join(config.Settings.xmlDir, xmlFile), 'w') as outputFile:
            outputFile.write(xml)
    except Exception:
        # Deliberately best effort: report the problem but do not abort
        print("Couldn't write the XML file")
        traceback.print_exc()

    return
| 186 | + |
def main():
    """
    _main_

    Script's main function.

    Runs the whole check: setup, job counting, availability calculation
    and XML generation. Returns 0 on success. If anything fails, a
    timestamp is written to stderr and the exception is re-raised with
    its original traceback.
    """
    try:
        # If the environment provides the agent configuration directory
        # in "config", point WMAGENT_CONFIG at the config.py inside it
        if "config" in os.environ:
            os.environ['WMAGENT_CONFIG'] = os.path.join(os.environ["config"], 'config.py')

        config = setup()
        jobCountsByType = countJobs()
        availability = calculateAvailability(config, jobCountsByType)
        buildSLSXML(config, jobCountsByType, availability)
        return 0

    except Exception:
        now = time.localtime()
        # time.timezone / time.altzone are seconds WEST of UTC; altzone
        # applies while daylight saving time is in effect
        if time.daylight and now.tm_isdst > 0:
            offsetSeconds = -time.altzone
        else:
            offsetSeconds = -time.timezone
        sign = "-" if offsetSeconds < 0 else "+"
        hours, minutes = divmod(abs(offsetSeconds) // 60, 60)
        timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", now)
        timestamp += "%s%02d:%02d" % (sign, hours, minutes)
        sys.stderr.write('\n' + timestamp + '\n')
        # Bare raise keeps the original traceback
        raise

if __name__ == "__main__":
    sys.exit(main())
0 commit comments