Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 06f5d9b

Browse files
committed
Updates to tier0 sls alarms scripts
Also updates an old query in the diagnoseActiveRuns script
1 parent 01d038e commit 06f5d9b

File tree

6 files changed

+261
-135
lines changed

6 files changed

+261
-135
lines changed

bin/cmst0_backlog_wma

Lines changed: 199 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1,100 +1,204 @@
11
#!/usr/bin/env python
22

3-
from WMCore.Configuration import loadConfigurationFile
4-
from WMCore.Database.CMSCouch import Database
5-
from string import Template
3+
"""
4+
_cmst0_backlog_wma_
5+
6+
Look for created jobs by workflow type in the Tier-0.
7+
If the number of created jobs for any workflow type exceeds its configured threshold
8+
then it alarms about it.
9+
10+
Availability metrics are defined as:
11+
12+
0 - There is backlog
13+
100 - There is no backlog
14+
"""
15+
16+
import logging
17+
import threading
18+
import sys
19+
import re
620
import os
721
import time
822

9-
# Load/set configuration :
10-
config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
11-
alarmConfigPath = os.path.join(os.environ.get("SLS_CONFIG") or
12-
os.environ["T0_ROOT"], 'etc/SLSAlarmsConfig.py')
13-
alarmConfig = loadConfigurationFile(alarmConfigPath)
14-
xmlFile = getattr(alarmConfig.cmst0_backlog_wma, "xmlFile", None) or "cmst0_backlog_wma.xml"
15-
slsXml = os.path.join(alarmConfig.Settings.xmlDir, xmlFile)
16-
17-
# Getting what we need from configuration
18-
couchURL = config.JobStateMachine.couchurl
19-
jobsDatabase = config.JobStateMachine.couchDBName + "/jobs"
20-
21-
jobsDB = Database(jobsDatabase, couchURL)
22-
23-
# This is the same as (example) :
24-
# http://vocms15.cern.ch:5984/wmagent_jobdump%2Fjobs/_design/JobDump/_view/createdJobsByWorkflowName?group_level=1
25-
rows = jobsDB.loadView("JobDump", "createdJobsByWorkflowName", {"group_level" : 1})
26-
27-
# This will store how much created jobs we have on each type, created means in the system but still not submitted
28-
jobCounts = {
29-
"Express" : 0,
30-
"Repack" : 0,
31-
"PromptReco" : 0
32-
}
33-
34-
for row in rows['rows']:
35-
workflow = row['key'][0]
36-
workflowType = workflow.split("_")[0]
37-
createdJobs = row['value']
38-
39-
jobCounts[workflowType] += createdJobs
40-
41-
# Here comes the TH logic, very simple in the beginning
42-
availability = 100
43-
for workflowType in jobCounts:
44-
limit = getattr(alarmConfig.cmst0_backlog_wma, workflowType)
45-
if jobCounts[workflowType] > limit:
46-
availability = 0
47-
48-
interventionInfo = {}
49-
if hasattr(alarmConfig.cmst0_backlog_wma, "Intervention"):
50-
startTime = alarmConfig.cmst0_backlog_wma.Intervention.startTime
51-
duration = alarmConfig.cmst0_backlog_wma.Intervention.duration
52-
message = alarmConfig.cmst0_backlog_wma.Intervention.message
53-
54-
# Check that the intervention is present or in the future
55-
structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S")
56-
startTimeSeconds = time.mktime(structStartTime)
57-
if (startTimeSeconds + duration * 3600) >= time.time():
58-
interventionInfo = {'startTime' : startTime,
59-
'duration' : duration,
60-
'message' : message}
61-
intervention = ""
62-
if interventionInfo:
63-
inteventionTemplate = """ <interventions>
64-
<intervention start="{startTime}" length="PT{duration}H">
65-
{message}
66-
</intervention>
67-
</interventions>"""
68-
69-
intervention = inteventionTemplate.format(**interventionInfo)
70-
71-
timezone = str(int(-time.timezone / 3600)).zfill(2)
72-
timestamp = time.strftime("%Y-%m-%dT%H:%M:%S+")
73-
timestamp += "%s:00" % timezone
74-
xmlValues = {
75-
"availability" : availability,
76-
"timestamp" : timestamp,
77-
"promptreco_count": jobCounts["PromptReco"],
78-
"express_count" : jobCounts["Express"],
79-
"repack_count" : jobCounts["Repack"],
80-
"intervention" : intervention
81-
}
82-
83-
84-
template = Template("""<?xml version="1.0" encoding="utf-8"?>
85-
<serviceupdate xmlns="http://sls.cern.ch/SLS/XML/update">
86-
<id>cmst0-wma-backlog</id>
87-
<availability>$availability</availability>
88-
<timestamp>$timestamp</timestamp>
89-
<data>
90-
<numericvalue name="promptreco_count" desc="Backlogged PromptReco">$promptreco_count</numericvalue>
91-
<numericvalue name="repack_count" desc="Backlogged Repack">$repack_count</numericvalue>
92-
<numericvalue name="express_count" desc="Backlogged Express">$express_count</numericvalue>
93-
</data>
94-
$intervention
95-
</serviceupdate> """)
96-
97-
xmlUpdated = template.safe_substitute(xmlValues)
98-
xml = open(slsXml, 'w')
99-
xml.write(xmlUpdated)
100-
xml.close()
23+
from WMCore.WMInit import connectToDB
24+
from WMCore.Database.DBFormatter import DBFormatter
25+
from WMCore.Configuration import loadConfigurationFile
26+
27+
class CreatedJobsDAO(DBFormatter):
    """
    _CreatedJobsDAO_

    DAO that counts, per workflow name, the WMBS jobs that are currently
    in the 'created' state.

    Each result row carries two columns: 'workflow' (the workflow name)
    and 'jobs' (the number of jobs in the 'created' state for it).
    """
    sql = """SELECT wmbs_workflow.name as workflow, COUNT(*) as jobs
             FROM wmbs_job
               INNER JOIN wmbs_jobgroup
                 ON wmbs_job.jobgroup = wmbs_jobgroup.id
               INNER JOIN wmbs_subscription
                 ON wmbs_subscription.id = wmbs_jobgroup.subscription
               INNER JOIN wmbs_workflow
                 ON wmbs_workflow.id = wmbs_subscription.workflow
             WHERE wmbs_job.state = (SELECT id FROM wmbs_job_state
                                     WHERE name = 'created')
             GROUP BY wmbs_workflow.name
          """

    def execute(self, conn = None, transaction = False):
        """
        _execute_

        Run the query and return its rows as formatted by formatDict.
        conn and transaction are passed through to processData unchanged.
        """
        result = self.dbi.processData(self.sql, conn = conn,
                                      transaction = transaction)
        results = self.formatDict(result)

        return results
53+
54+
def setup():
    """
    _setup_

    Perform any global setup operations.
    Sets up the connection to the database and loads the alarm
    configuration from etc/SLSAlarmsConfig.py, located under $SLS_CONFIG
    or, when that is unset or empty, under $T0_ROOT.

    Returns the cmst0_backlog_wma section of that configuration, with the
    global Settings section attached to it as config.Settings.
    Raises KeyError if neither SLS_CONFIG nor T0_ROOT is set.
    """
    connectToDB()
    configPath = os.path.join(os.environ.get("SLS_CONFIG") or
                              os.environ["T0_ROOT"], "etc/SLSAlarmsConfig.py")
    # Load only the relevant alarm configuration but add anything from Settings
    fullConfig = loadConfigurationFile(configPath)
    config = getattr(fullConfig, "cmst0_backlog_wma")
    settings = getattr(fullConfig, "Settings")
    config.section_("Settings")
    config.Settings = settings
    return config
72+
73+
def countJobs():
    """
    _countJobs_

    Creates the summary of created jobs in the Tier-0
    by workflow type: Express, Repack and PromptReco.

    The workflow type is taken as the part of the workflow name that
    precedes the first underscore. Workflows of any other type are
    skipped with a warning instead of aborting the whole check.

    Returns a dictionary mapping each of the three types to its number
    of created jobs (0 when there are none).
    """
    myThread = threading.current_thread()
    retrieveInfo = CreatedJobsDAO(logger = logging, dbinterface = myThread.dbi)
    jobCountsByWorkflow = retrieveInfo.execute()
    jobCounts = {
        "Express" : 0,
        "Repack" : 0,
        "PromptReco" : 0
    }
    for workflow in jobCountsByWorkflow:
        workflowType = workflow['workflow'].split("_")[0]
        createdJobs = workflow['jobs']

        if workflowType not in jobCounts:
            # Only the three known types have a configured threshold, so
            # there is nothing to compare other workflows against.
            logging.warning("Ignoring workflow %s of unknown type %s",
                            workflow['workflow'], workflowType)
            continue

        jobCounts[workflowType] += createdJobs

    return jobCounts
95+
96+
def calculateAvailability(config, jobCountsByType):
    """
    _calculateAvailability_

    Calculate the availability of the service
    according to the guidelines defined in the module
    documentation.

    config must expose one attribute per workflow type holding the
    maximum tolerated number of created jobs; jobCountsByType maps each
    workflow type to its current number of created jobs.

    Returns 0 if any type is strictly above its limit, 100 otherwise.
    Raises AttributeError if a type has no limit in config.
    """
    availability = 100
    # Deliberately no early exit: every type is looked up so that a
    # missing limit is always reported, whatever the other counts are.
    for workflowType in jobCountsByType:
        limit = getattr(config, workflowType)
        if jobCountsByType[workflowType] > limit:
            availability = 0

    return availability
111+
112+
def _slsTimestamp():
    """
    Return the current local time as YYYY-MM-DDTHH:MM:SS followed by the
    signed UTC offset (+HH:MM or -HH:MM) that is in effect right now.
    """
    now = time.localtime()
    # time.timezone and time.altzone are expressed in seconds WEST of UTC,
    # hence the sign flip. altzone applies while DST is in effect.
    if time.daylight and now.tm_isdst > 0:
        utcOffset = -time.altzone
    else:
        utcOffset = -time.timezone
    sign = "+" if utcOffset >= 0 else "-"
    hours, minutes = divmod(abs(utcOffset) // 60, 60)
    return "%s%s%02d:%02d" % (time.strftime("%Y-%m-%dT%H:%M:%S", now),
                              sign, hours, minutes)


def _interventionXML(config):
    """
    Return the <interventions> XML fragment for the intervention declared
    in config, or an empty string if there is none or it is already over.
    """
    if not hasattr(config, "Intervention"):
        return ""

    startTime = config.Intervention.startTime
    duration = config.Intervention.duration
    message = config.Intervention.message

    # Check that the intervention is present or in the future
    structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S")
    startTimeSeconds = time.mktime(structStartTime)
    if (startTimeSeconds + duration * 3600) < time.time():
        return ""

    interventionTemplate = """    <interventions>
        <intervention start="{startTime}" length="PT{duration}H">
            {message}
        </intervention>
    </interventions>"""

    return interventionTemplate.format(startTime = startTime,
                                       duration = duration,
                                       message = message)


def buildSLSXML(config, jobCountsByType, availability):
    """
    _buildSLSXML_

    Builds the XML file for the SLS update of this alarm.

    config is the alarm configuration: config.Settings.xmlDir is the
    output directory, the optional config.xmlFile is the file name
    (default cmst0_backlog_wma.xml) and the optional config.Intervention
    section (startTime, duration in hours, message) declares an
    intervention. jobCountsByType must contain the PromptReco, Express
    and Repack counts; availability is written out as given.

    Writing is best effort: if the file cannot be written the error is
    printed together with its traceback and the function returns normally.
    """
    # Imported here because the handler below needs it and the module
    # level imports do not provide it.
    import traceback

    timestamp = _slsTimestamp()
    intervention = _interventionXML(config)

    jobsInfo = {
        "promptreco_count": jobCountsByType["PromptReco"],
        "express_count" : jobCountsByType["Express"],
        "repack_count" : jobCountsByType["Repack"],
    }

    dataTemplate = """
        <numericvalue name="promptreco_count" desc="Backlogged PromptReco">{promptreco_count}</numericvalue>
        <numericvalue name="repack_count" desc="Backlogged Repack">{repack_count}</numericvalue>
        <numericvalue name="express_count" desc="Backlogged Express">{express_count}</numericvalue>"""
    data = dataTemplate.format(**jobsInfo)

    template = """<?xml version="1.0" encoding="utf-8"?>
<serviceupdate>
    <id>CMST0-wma-backlog</id>
    <availability>{availability}</availability>
    <timestamp>{timestamp}</timestamp>
    <data>
{data}
    </data>
{intervention}
</serviceupdate>\n"""

    xml = template.format(data = data, availability = availability,
                          timestamp = timestamp, intervention = intervention)

    # Get the output file path
    xmlFile = getattr(config, "xmlFile", "cmst0_backlog_wma.xml")
    try:
        # The with block closes the file on every path and, unlike a
        # finally clause, never touches the handle when open() itself fails.
        with open(os.path.join(config.Settings.xmlDir, xmlFile), 'w') as outputFile:
            outputFile.write(xml)
    except (IOError, OSError):
        print("Couldn't write the XML file")
        traceback.print_exc()

    return
186+
187+
def main():
    """
    _main_

    Script's main function: loads the configuration, counts the created
    jobs by workflow type, derives the availability and writes the SLS
    XML file. Returns 0, which is used as the process exit code.
    """
    # If the "config" environment variable is set, treat it as the directory
    # holding the wmagent config.py and point WMAGENT_CONFIG at that file.
    # The "in" operator is used because dict.has_key does not exist on
    # Python 3 while "in" works on both major versions.
    if "config" in os.environ:
        os.environ['WMAGENT_CONFIG'] = os.path.join(os.environ["config"], 'config.py')

    config = setup()
    jobCountsByType = countJobs()
    availability = calculateAvailability(config, jobCountsByType)
    buildSLSXML(config, jobCountsByType, availability)
    return 0
202+
203+
if __name__ == "__main__":
204+
sys.exit(main())

bin/cmst0_late_workflows

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import re
1818
import os
1919
import sys
2020

21-
from WMCore.Services.WMStats.WMStatsReader import WMStatsReader
21+
from WMCore.Services.RequestDB.RequestDBReader import RequestDBReader
2222
from WMCore.Configuration import loadConfigurationFile
2323

2424
promptRecoRegexp = re.compile(r'^PromptReco_.*$')
@@ -44,19 +44,26 @@ def loadLimitsFromConfig(config):
4444
limits[workflowType][state] = getattr(workflowTypeInfo, state)
4545
return limits
4646

47-
def processWorkflows(wmStats, workflowLimits):
47+
def processWorkflows(requestDBreader, workflowLimits, statusList):
4848
"""
4949
_processWorkflows_
5050
51-
The core logic of the tool, it gets a WMStatsReader object
51+
The core logic of the tool, it gets a RequestDBReader object
5252
and checks all the available workflows in it. It uses the
5353
workflowLimits to check which workflows have remained in a state
5454
longer than expected, finally it returns a dictionary with the problematic
5555
workflows, their states and the time spent in them.
5656
"""
57-
58-
workflowStates = wmStats.workflowStatus(stale = False)
59-
workflowStates = dict(map(lambda (key, value): (key.replace(' ', ''), value), workflowStates.items()))
57+
workflowStates = {}
58+
workflowInfo = requestDBreader.getRequestByStatus(statusList, detail = True)
59+
for workflow in workflowInfo.keys():
60+
workflowStatus = workflowInfo[workflow]['RequestStatus']
61+
statusTimestamps = workflowInfo[workflow]['RequestTransition']
62+
for stateInfo in statusTimestamps:
63+
if stateInfo['Status'] == workflowStatus:
64+
if workflowStatus not in workflowStates:
65+
workflowStates[workflowStatus] = {}
66+
workflowStates[workflowStatus][workflow] = stateInfo['UpdateTime']
6067
problematicWorkflows = []
6168
currentTime = int(time.time())
6269
for workflowType in workflowLimits:
@@ -129,7 +136,7 @@ def buildSLSXML(outputFilePath, data, runBlacklist, interventionInfo):
129136
intervention = inteventionTemplate.format(**interventionInfo)
130137

131138
template = """<?xml version="1.0" encoding="utf-8"?>
132-
<serviceupdate xmlns="http://sls.cern.ch/SLS/XML/update">
139+
<serviceupdate>
133140
<id>CMST0-late-workflows</id>
134141
<availability>{availability}</availability>
135142
<timestamp>{timestamp}</timestamp>
@@ -154,17 +161,23 @@ def buildSLSXML(outputFilePath, data, runBlacklist, interventionInfo):
154161
return
155162

156163
def main():
157-
164+
#Load configuration files
165+
if os.environ.has_key("config"):
166+
os.environ['WMAGENT_CONFIG'] = os.path.join(os.environ.get("config"), 'config.py')
158167
agentConfig = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
159-
localWMStatsURL = agentConfig.AnalyticsDataCollector.localWMStatsURL
160-
wmstatsReader = WMStatsReader(couchURL = localWMStatsURL)
161-
168+
162169
alarmConfigPath = os.path.join(os.environ.get("SLS_CONFIG") or
163170
os.environ["T0_ROOT"], 'etc/SLSAlarmsConfig.py')
164171
alarmConfig = loadConfigurationFile(alarmConfigPath)
172+
173+
# Connect to the request db
174+
requestDBurl = agentConfig.AnalyticsDataCollector.centralRequestDBURL
175+
requestDBreader = RequestDBReader(requestDBurl, couchapp = 'T0Request')
165176

177+
statusList = ["new","Closed","Merge","Harvesting","Processing Done","AlcaSkim","completed"]
178+
166179
workflowLimits = loadLimitsFromConfig(alarmConfig)
167-
data = processWorkflows(wmstatsReader, workflowLimits)
180+
data = processWorkflows(requestDBreader, workflowLimits, statusList)
168181

169182
interventionInfo = {}
170183
if hasattr(alarmConfig.cmst0_late_workflows, "Intervention"):

0 commit comments

Comments
 (0)