Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 7af848c

Browse files
committed
Merge pull request #4111 from lucacopa/sls_alarm_update
Update tier0 sls alarms scripts
2 parents 686e72c + bca6e84 commit 7af848c

File tree

6 files changed

+367
-196
lines changed

6 files changed

+367
-196
lines changed

bin/cmst0_backlog_wma

Lines changed: 207 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,212 @@
11
#!/usr/bin/env python
22

3-
from WMCore.Configuration import loadConfigurationFile
4-
from WMCore.Database.CMSCouch import Database
5-
from string import Template
3+
"""
4+
_cmst0_backlog_wma_
5+
6+
Look for created jobs by workflow type in the Tier-0.
7+
If the total number of created jobs for a workflow type exceeds its configured threshold
8+
then it alarms about it.
9+
10+
Availability metrics are defined as:
11+
12+
0 - There is backlog
13+
100 - There is no backlog
14+
"""
15+
16+
import logging
17+
import threading
18+
import sys
19+
import re
620
import os
721
import time
822

9-
# Load/set configuration :
10-
config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
11-
alarmConfigPath = os.path.join(os.environ.get("SLS_CONFIG") or
12-
os.environ["T0_ROOT"], 'etc/SLSAlarmsConfig.py')
13-
alarmConfig = loadConfigurationFile(alarmConfigPath)
14-
xmlFile = getattr(alarmConfig.cmst0_backlog_wma, "xmlFile", None) or "cmst0_backlog_wma.xml"
15-
slsXml = os.path.join(alarmConfig.Settings.xmlDir, xmlFile)
16-
17-
# Getting what we need from configuration
18-
couchURL = config.JobStateMachine.couchurl
19-
jobsDatabase = config.JobStateMachine.couchDBName + "/jobs"
20-
21-
jobsDB = Database(jobsDatabase, couchURL)
22-
23-
# This is the same as (example) :
24-
# http://vocms15.cern.ch:5984/wmagent_jobdump%2Fjobs/_design/JobDump/_view/createdJobsByWorkflowName?group_level=1
25-
rows = jobsDB.loadView("JobDump", "createdJobsByWorkflowName", {"group_level" : 1})
26-
27-
# This will store how much created jobs we have on each type, created means in the system but still not submitted
28-
jobCounts = {
29-
"Express" : 0,
30-
"Repack" : 0,
31-
"PromptReco" : 0
32-
}
33-
34-
for row in rows['rows']:
35-
workflow = row['key'][0]
36-
workflowType = workflow.split("_")[0]
37-
createdJobs = row['value']
38-
39-
jobCounts[workflowType] += createdJobs
40-
41-
# Here comes the TH logic, very simple in the beginning
42-
availability = 100
43-
for workflowType in jobCounts:
44-
limit = getattr(alarmConfig.cmst0_backlog_wma, workflowType)
45-
if jobCounts[workflowType] > limit:
46-
availability = 0
47-
48-
interventionInfo = {}
49-
if hasattr(alarmConfig.cmst0_backlog_wma, "Intervention"):
50-
startTime = alarmConfig.cmst0_backlog_wma.Intervention.startTime
51-
duration = alarmConfig.cmst0_backlog_wma.Intervention.duration
52-
message = alarmConfig.cmst0_backlog_wma.Intervention.message
53-
54-
# Check that the intervention is present or in the future
55-
structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S")
56-
startTimeSeconds = time.mktime(structStartTime)
57-
if (startTimeSeconds + duration * 3600) >= time.time():
58-
interventionInfo = {'startTime' : startTime,
59-
'duration' : duration,
60-
'message' : message}
61-
intervention = ""
62-
if interventionInfo:
63-
inteventionTemplate = """ <interventions>
64-
<intervention start="{startTime}" length="PT{duration}H">
65-
{message}
66-
</intervention>
67-
</interventions>"""
68-
69-
intervention = inteventionTemplate.format(**interventionInfo)
70-
71-
timezone = str(int(-time.timezone / 3600)).zfill(2)
72-
timestamp = time.strftime("%Y-%m-%dT%H:%M:%S+")
73-
timestamp += "%s:00" % timezone
74-
xmlValues = {
75-
"availability" : availability,
76-
"timestamp" : timestamp,
77-
"promptreco_count": jobCounts["PromptReco"],
78-
"express_count" : jobCounts["Express"],
79-
"repack_count" : jobCounts["Repack"],
80-
"intervention" : intervention
81-
}
82-
83-
84-
template = Template("""<?xml version="1.0" encoding="utf-8"?>
85-
<serviceupdate xmlns="http://sls.cern.ch/SLS/XML/update">
86-
<id>cmst0-wma-backlog</id>
87-
<availability>$availability</availability>
88-
<timestamp>$timestamp</timestamp>
89-
<data>
90-
<numericvalue name="promptreco_count" desc="Backlogged PromptReco">$promptreco_count</numericvalue>
91-
<numericvalue name="repack_count" desc="Backlogged Repack">$repack_count</numericvalue>
92-
<numericvalue name="express_count" desc="Backlogged Express">$express_count</numericvalue>
93-
</data>
94-
$intervention
95-
</serviceupdate> """)
96-
97-
xmlUpdated = template.safe_substitute(xmlValues)
98-
xml = open(slsXml, 'w')
99-
xml.write(xmlUpdated)
100-
xml.close()
23+
from WMCore.WMInit import connectToDB
24+
from WMCore.Database.DBFormatter import DBFormatter
25+
from WMCore.Configuration import loadConfigurationFile
26+
27+
class CreatedJobsDAO(DBFormatter):
    """
    _CreatedJobsDAO_

    DAO that counts the jobs currently in the 'created' state,
    grouped by the name of the workflow they belong to.
    Each result row carries the workflow name under 'workflow'
    and the number of created jobs under 'jobs'.
    """
    sql = """SELECT wmbs_workflow.name as workflow, COUNT(*) as jobs
             FROM wmbs_job
             INNER JOIN wmbs_jobgroup
             ON wmbs_job.jobgroup = wmbs_jobgroup.id
             INNER JOIN wmbs_subscription
             ON wmbs_subscription.id = wmbs_jobgroup.subscription
             INNER JOIN wmbs_workflow
             ON wmbs_workflow.id = wmbs_subscription.workflow
             WHERE wmbs_job.state = (SELECT id FROM wmbs_job_state
             WHERE name = 'created')
             GROUP BY wmbs_workflow.name
             """

    def execute(self, conn = None, transaction = False):
        """
        _execute_

        Run the query through the database interface and return
        the rows as formatted by formatDict.
        """
        rawRows = self.dbi.processData(self.sql, conn = conn,
                                       transaction = transaction)
        return self.formatDict(rawRows)
53+
54+
def setup():
    """
    _setup_

    Global initialisation for the script.

    Opens the database connection, loads etc/SLSAlarmsConfig.py from
    the directory given by SLS_CONFIG (falling back to T0_ROOT when
    SLS_CONFIG is unset or empty) and returns the cmst0_backlog_wma
    section with the global Settings section attached to it.
    """
    connectToDB()

    # T0_ROOT is only consulted when SLS_CONFIG does not provide a value
    baseDir = os.environ.get("SLS_CONFIG")
    if not baseDir:
        baseDir = os.environ["T0_ROOT"]
    alarmsConfigFile = os.path.join(baseDir, "etc/SLSAlarmsConfig.py")

    # Keep only this alarm's section, but carry the shared Settings along
    wholeConfig = loadConfigurationFile(alarmsConfigFile)
    alarmConfig = wholeConfig.cmst0_backlog_wma
    sharedSettings = wholeConfig.Settings
    alarmConfig.section_("Settings")
    alarmConfig.Settings = sharedSettings
    return alarmConfig
72+
73+
def countJobs():
    """
    _countJobs_

    Summarise the created jobs in the Tier-0 by workflow type.

    Returns a dictionary with the keys "Express", "Repack" and
    "PromptReco", each mapped to the number of created jobs of that
    type. The type is taken from the part of the workflow name that
    precedes the first underscore. Workflows of any other type are
    ignored, so that one unexpected workflow name cannot abort the alarm.
    """
    # NOTE(review): the dbi attribute is presumably attached to the current
    # thread by connectToDB() in setup() - confirm against WMCore.WMInit
    myThread = threading.current_thread()
    retrieveInfo = CreatedJobsDAO(logger = logging, dbinterface = myThread.dbi)

    jobCounts = {
        "Express" : 0,
        "Repack" : 0,
        "PromptReco" : 0
    }
    for row in retrieveInfo.execute():
        workflowType = row['workflow'].split("_")[0]
        if workflowType not in jobCounts:
            # Not one of the monitored types: skip it instead of raising
            # KeyError, which would prevent the XML from being published
            logging.debug("Ignoring workflow %s of unmonitored type %s",
                          row['workflow'], workflowType)
            continue
        jobCounts[workflowType] += row['jobs']

    return jobCounts
95+
96+
def calculateAvailability(config, jobCountsByType):
    """
    _calculateAvailability_

    Work out the availability value to publish:

    0   - at least one workflow type has more created jobs than the
          threshold stored under its name in config
    100 - no workflow type exceeds its threshold
    """
    # A list (not a generator) is built on purpose: the threshold of every
    # workflow type is looked up, even after one is found to be exceeded
    overThreshold = [jobs > getattr(config, wfType)
                     for wfType, jobs in jobCountsByType.items()]
    if any(overThreshold):
        return 0
    return 100
111+
112+
def buildSLSXML(config, jobCountsByType, availability):
    """
    _buildSLSXML_

    Build the SLS service update XML and write it to disk.

    config is the alarm configuration: it must provide Settings.xmlDir
    and may provide xmlFile (default cmst0_backlog_wma.xml) and an
    Intervention section with startTime (%Y-%m-%dT%H:%M:%S, local time),
    duration (in hours) and message.
    jobCountsByType maps "PromptReco", "Express" and "Repack" to their
    number of created jobs, availability is the value to publish.

    A failure to write the file is reported (message plus traceback)
    but not raised. Nothing is returned.
    """
    jobsInfo = {
        "promptreco_count": jobCountsByType["PromptReco"],
        "express_count" : jobCountsByType["Express"],
        "repack_count" : jobCountsByType["Repack"],
    }

    dataTemplate = """
        <numericvalue name="promptreco_count" desc="Backlogged PromptReco">{promptreco_count}</numericvalue>
        <numericvalue name="repack_count" desc="Backlogged Repack">{repack_count}</numericvalue>
        <numericvalue name="express_count" desc="Backlogged Express">{express_count}</numericvalue>"""
    data = dataTemplate.format(**jobsInfo)

    template = """<?xml version="1.0" encoding="utf-8"?>
<serviceupdate>
    <id>CMST0-wma-backlog</id>
    <availability>{availability}</availability>
    <timestamp>{timestamp}</timestamp>
    <data>
    {data}
    </data>
    {intervention}
</serviceupdate>\n"""

    xml = template.format(data = data, availability = availability,
                          timestamp = _slsTimestamp(),
                          intervention = _interventionXML(config))

    # Get the output file path
    xmlFile = getattr(config, "xmlFile", "cmst0_backlog_wma.xml")
    outputPath = os.path.join(config.Settings.xmlDir, xmlFile)
    try:
        # The with block closes the file even if the write fails, and does
        # not touch an unbound handle when open() itself fails
        with open(outputPath, 'w') as outputFile:
            outputFile.write(xml)
    except (IOError, OSError):
        # Imported here because the script's import block does not
        # provide traceback
        import traceback
        print("Couldn't write the XML file")
        traceback.print_exc()

    return


def _slsTimestamp():
    """
    Return the current local time as %Y-%m-%dT%H:%M:%S followed by the
    signed UTC offset (+HH:MM or -HH:MM).
    """
    now = time.localtime()
    # time.timezone is the non-DST offset; while daylight saving time is
    # in effect the wall clock follows time.altzone instead
    if time.daylight and now.tm_isdst > 0:
        offset = -time.altzone
    else:
        offset = -time.timezone
    sign = "+" if offset >= 0 else "-"
    hours, minutes = divmod(abs(offset) // 60, 60)
    return "%s%s%02d:%02d" % (time.strftime("%Y-%m-%dT%H:%M:%S", now),
                              sign, hours, minutes)


def _interventionXML(config):
    """
    Return the <interventions> fragment for an ongoing or future
    intervention, or an empty string when there is nothing to publish.
    """
    if not hasattr(config, "Intervention"):
        return ""

    startTime = config.Intervention.startTime
    duration = config.Intervention.duration
    message = config.Intervention.message

    # Drop interventions that have already finished
    structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S")
    startTimeSeconds = time.mktime(structStartTime)
    if (startTimeSeconds + duration * 3600) < time.time():
        return ""

    interventionTemplate = """ <interventions>
        <intervention start="{startTime}" length="PT{duration}H">
        {message}
        </intervention>
    </interventions>"""

    return interventionTemplate.format(startTime = startTime,
                                       duration = duration,
                                       message = message)
186+
187+
def main():
    """
    _main_

    Script's main function.

    Loads the configuration, counts the created jobs by workflow type,
    computes the availability and writes the SLS XML file.
    Returns 0 on success. On any error a timestamp is written to stderr
    and the original exception is re-raised with its traceback intact.
    """
    try:
        # If the environment provides a "config" directory, point
        # WMAGENT_CONFIG at the config.py inside it
        if "config" in os.environ:
            os.environ['WMAGENT_CONFIG'] = os.path.join(os.environ["config"], 'config.py')

        config = setup()
        jobCountsByType = countJobs()
        availability = calculateAvailability(config, jobCountsByType)
        buildSLSXML(config, jobCountsByType, availability)
        return 0

    except Exception:
        now = time.localtime()
        # Use the DST offset while daylight saving time is in effect and
        # emit an explicit sign so zones west of UTC are formatted correctly
        if time.daylight and now.tm_isdst > 0:
            offset = -time.altzone
        else:
            offset = -time.timezone
        sign = "+" if offset >= 0 else "-"
        hours, minutes = divmod(abs(offset) // 60, 60)
        timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", now)
        timestamp += "%s%02d:%02d" % (sign, hours, minutes)
        sys.stderr.write('\n' + timestamp + '\n')
        # Bare raise keeps the traceback of the original failure
        raise


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)