Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
302 changes: 207 additions & 95 deletions bin/cmst0_backlog_wma
Original file line number Diff line number Diff line change
@@ -1,100 +1,212 @@
#!/usr/bin/env python

from WMCore.Configuration import loadConfigurationFile
from WMCore.Database.CMSCouch import Database
from string import Template
"""
_cmst0_backlog_wma_

Look for created jobs by workflow type in the Tier-0.
If the number of created jobs of any workflow type exceeds its configured
threshold, then an alarm is raised.

Availability metrics are defined as:

0 - There is backlog
100 - There is no backlog
"""

import logging
import threading
import sys
import re
import os
import time

# Load/set configuration :
# SLS_CONFIG takes precedence over T0_ROOT when it is set and non-empty
config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
alarmConfigPath = os.path.join(os.environ.get("SLS_CONFIG") or
                               os.environ["T0_ROOT"], 'etc/SLSAlarmsConfig.py')
alarmConfig = loadConfigurationFile(alarmConfigPath)
xmlFile = getattr(alarmConfig.cmst0_backlog_wma, "xmlFile", None) or "cmst0_backlog_wma.xml"
slsXml = os.path.join(alarmConfig.Settings.xmlDir, xmlFile)

# Getting what we need from configuration
couchURL = config.JobStateMachine.couchurl
jobsDatabase = config.JobStateMachine.couchDBName + "/jobs"

jobsDB = Database(jobsDatabase, couchURL)

# This is the same as (example) :
# http://vocms15.cern.ch:5984/wmagent_jobdump%2Fjobs/_design/JobDump/_view/createdJobsByWorkflowName?group_level=1
rows = jobsDB.loadView("JobDump", "createdJobsByWorkflowName", {"group_level" : 1})

# This will store how much created jobs we have on each type, created means in the system but still not submitted
jobCounts = {
    "Express" : 0,
    "Repack" : 0,
    "PromptReco" : 0
}

for row in rows['rows']:
    workflow = row['key'][0]
    workflowType = workflow.split("_")[0]
    createdJobs = row['value']

    # A workflow of a type that is not monitored must not abort the
    # whole check with a KeyError, it is simply left out of the summary
    if workflowType in jobCounts:
        jobCounts[workflowType] += createdJobs

# Here comes the TH logic, very simple in the beginning:
# a single type above its configured limit means there is backlog
availability = 100
for workflowType in jobCounts:
    limit = getattr(alarmConfig.cmst0_backlog_wma, workflowType)
    if jobCounts[workflowType] > limit:
        availability = 0

interventionInfo = {}
if hasattr(alarmConfig.cmst0_backlog_wma, "Intervention"):
    startTime = alarmConfig.cmst0_backlog_wma.Intervention.startTime
    duration = alarmConfig.cmst0_backlog_wma.Intervention.duration
    message = alarmConfig.cmst0_backlog_wma.Intervention.message

    # Check that the intervention is present or in the future
    structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S")
    startTimeSeconds = time.mktime(structStartTime)
    if (startTimeSeconds + duration * 3600) >= time.time():
        interventionInfo = {'startTime' : startTime,
                            'duration' : duration,
                            'message' : message}
intervention = ""
if interventionInfo:
    inteventionTemplate = """ <interventions>
<intervention start="{startTime}" length="PT{duration}H">
{message}
</intervention>
</interventions>"""

    intervention = inteventionTemplate.format(**interventionInfo)

# Build the local timestamp with the UTC offset that is in effect now.
# time.timezone / time.altzone are seconds *west* of UTC, hence the
# negation; altzone applies while daylight saving time is active.
localNow = time.localtime()
if time.daylight and localNow.tm_isdst > 0:
    utcOffset = -time.altzone
else:
    utcOffset = -time.timezone
offsetSign = "+" if utcOffset >= 0 else "-"
offsetHours, offsetMinutes = divmod(abs(utcOffset) // 60, 60)
timezone = "%s%02d:%02d" % (offsetSign, offsetHours, offsetMinutes)
timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", localNow)
timestamp += timezone
xmlValues = {
    "availability" : availability,
    "timestamp" : timestamp,
    "promptreco_count": jobCounts["PromptReco"],
    "express_count" : jobCounts["Express"],
    "repack_count" : jobCounts["Repack"],
    "intervention" : intervention
}


template = Template("""<?xml version="1.0" encoding="utf-8"?>
<serviceupdate xmlns="http://sls.cern.ch/SLS/XML/update">
<id>cmst0-wma-backlog</id>
<availability>$availability</availability>
<timestamp>$timestamp</timestamp>
<data>
<numericvalue name="promptreco_count" desc="Backlogged PromptReco">$promptreco_count</numericvalue>
<numericvalue name="repack_count" desc="Backlogged Repack">$repack_count</numericvalue>
<numericvalue name="express_count" desc="Backlogged Express">$express_count</numericvalue>
</data>
$intervention
</serviceupdate> """)

xmlUpdated = template.safe_substitute(xmlValues)
# The context manager closes the file even if the write fails
with open(slsXml, 'w') as xml:
    xml.write(xmlUpdated)
from WMCore.WMInit import connectToDB
from WMCore.Database.DBFormatter import DBFormatter
from WMCore.Configuration import loadConfigurationFile

class CreatedJobsDAO(DBFormatter):
    """
    _CreatedJobsDAO_

    DAO that counts the WMBS jobs which are in the 'created' state,
    grouped by the name of the workflow they belong to.
    Every result row carries two columns: 'workflow' (the workflow
    name) and 'jobs' (the number of created jobs for that workflow).
    """
    sql = ("SELECT wmbs_workflow.name as workflow, COUNT(*) as jobs\n"
           "FROM wmbs_job\n"
           "INNER JOIN wmbs_jobgroup\n"
           "ON wmbs_job.jobgroup = wmbs_jobgroup.id\n"
           "INNER JOIN wmbs_subscription\n"
           "ON wmbs_subscription.id = wmbs_jobgroup.subscription\n"
           "INNER JOIN wmbs_workflow\n"
           "ON wmbs_workflow.id = wmbs_subscription.workflow\n"
           "WHERE wmbs_job.state = (SELECT id FROM wmbs_job_state\n"
           "WHERE name = 'created')\n"
           "GROUP BY wmbs_workflow.name\n")

    def execute(self, conn = None, transaction = False):
        """
        _execute_

        Run the query through the database interface and return the
        rows as formatted by formatDict.
        conn and transaction are handed over to processData untouched.
        """
        queryResult = self.dbi.processData(self.sql, conn = conn,
                                           transaction = transaction)
        return self.formatDict(queryResult)

def setup():
    """
    _setup_

    Global setup step of the script: opens the database connection
    and reads the SLS alarms configuration file.
    The returned object is the cmst0_backlog_wma section of that file,
    with the global Settings section attached to it as Settings.
    """
    connectToDB()

    # SLS_CONFIG wins when it is set and non-empty, T0_ROOT is the fallback
    baseDir = os.environ.get("SLS_CONFIG")
    if not baseDir:
        baseDir = os.environ["T0_ROOT"]
    alarmsConfig = loadConfigurationFile(os.path.join(baseDir,
                                                      "etc/SLSAlarmsConfig.py"))

    # Keep only this alarm's section but carry the shared Settings along
    backlogConfig = alarmsConfig.cmst0_backlog_wma
    sharedSettings = alarmsConfig.Settings
    backlogConfig.section_("Settings")
    backlogConfig.Settings = sharedSettings
    return backlogConfig

def countJobs():
    """
    _countJobs_

    Creates the summary of created jobs in the Tier-0
    by workflow type: Express, Repack and PromptReco.
    The type is the part of the workflow name before the first "_".
    Workflows of any other type are ignored instead of aborting the
    check with a KeyError.

    Returns a dictionary with the three types as keys and the number
    of created jobs as values.
    """
    myThread = threading.currentThread()
    retrieveInfo = CreatedJobsDAO(logger = logging, dbinterface = myThread.dbi)

    jobCounts = {
        "Express" : 0,
        "Repack" : 0,
        "PromptReco" : 0
    }
    for row in retrieveInfo.execute():
        workflowType = row['workflow'].split("_")[0]
        if workflowType not in jobCounts:
            # Not one of the monitored types, nothing to account for
            continue
        jobCounts[workflowType] += row['jobs']

    return jobCounts

def calculateAvailability(config, jobCountsByType):
    """
    _calculateAvailability_

    Compare the job count of every workflow type against the limit
    stored in config under the attribute named like the type.
    Returns 0 (there is backlog) as soon as one type is above its
    limit and 100 (no backlog) otherwise.
    """
    # Evaluate every type first so that a missing limit is always reported
    overLimit = [jobCount > getattr(config, workflowType)
                 for workflowType, jobCount in jobCountsByType.items()]
    if any(overLimit):
        return 0
    return 100

def _localTimestamp():
    """
    Returns the current local time as YYYY-MM-DDTHH:MM:SS followed by
    the signed UTC offset (+HH:MM or -HH:MM) which is in effect now.
    """
    now = time.localtime()
    # time.timezone / time.altzone are seconds *west* of UTC, hence the
    # negation; altzone applies while daylight saving time is active
    if time.daylight and now.tm_isdst > 0:
        offset = -time.altzone
    else:
        offset = -time.timezone
    sign = "+" if offset >= 0 else "-"
    hours, minutes = divmod(abs(offset) // 60, 60)
    return "%s%s%02d:%02d" % (time.strftime("%Y-%m-%dT%H:%M:%S", now),
                              sign, hours, minutes)

def buildSLSXML(config, jobCountsByType, availability):
    """
    _buildSLSXML_

    Builds the XML file for the SLS update and writes it to
    config.Settings.xmlDir, under the name config.xmlFile
    (cmst0_backlog_wma.xml when that attribute is not set).

    config is the alarm configuration, an optional Intervention
    section (startTime, duration in hours, message) is reported only
    if the intervention is ongoing or in the future.
    jobCountsByType must provide the PromptReco, Express and Repack
    counts and availability is the value to be published.

    Failing to write the file is reported but it does not raise.
    """
    timestamp = _localTimestamp()

    # Retrieve the intervention info if any
    intervention = ""
    if hasattr(config, "Intervention"):
        startTime = config.Intervention.startTime
        duration = config.Intervention.duration
        message = config.Intervention.message

        # Check that the intervention is present or in the future
        structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S")
        startTimeSeconds = time.mktime(structStartTime)
        if (startTimeSeconds + duration * 3600) >= time.time():
            interventionTemplate = (
                ' <interventions>\n'
                '<intervention start="{startTime}" length="PT{duration}H">\n'
                '{message}\n'
                '</intervention>\n'
                '</interventions>')
            intervention = interventionTemplate.format(startTime = startTime,
                                                       duration = duration,
                                                       message = message)

    dataTemplate = (
        '\n'
        '<numericvalue name="promptreco_count" desc="Backlogged PromptReco">{promptreco_count}</numericvalue>\n'
        '<numericvalue name="repack_count" desc="Backlogged Repack">{repack_count}</numericvalue>\n'
        '<numericvalue name="express_count" desc="Backlogged Express">{express_count}</numericvalue>')
    data = dataTemplate.format(promptreco_count = jobCountsByType["PromptReco"],
                               express_count = jobCountsByType["Express"],
                               repack_count = jobCountsByType["Repack"])

    template = ('<?xml version="1.0" encoding="utf-8"?>\n'
                '<serviceupdate>\n'
                '<id>CMST0-wma-backlog</id>\n'
                '<availability>{availability}</availability>\n'
                '<timestamp>{timestamp}</timestamp>\n'
                '<data>\n'
                '{data}\n'
                '</data>\n'
                '{intervention}\n'
                '</serviceupdate>\n')

    xml = template.format(data = data, availability = availability,
                          timestamp = timestamp, intervention = intervention)

    # Get the output file path
    xmlFile = getattr(config, "xmlFile", "cmst0_backlog_wma.xml")
    outputPath = os.path.join(config.Settings.xmlDir, xmlFile)
    try:
        # The context manager closes the file whatever happens and it
        # does not touch the handle when the open itself fails
        with open(outputPath, 'w') as outputFile:
            outputFile.write(xml)
    except (IOError, OSError):
        import traceback
        sys.stdout.write("Couldn't write the XML file\n")
        traceback.print_exc()

    return

def main():
    """
    _main_

    Script's main function: loads the configuration, counts the
    created jobs, computes the availability and writes the SLS XML.
    Returns 0 on success. On any failure a timestamp is written to
    stderr and the exception is re-raised with its original traceback.
    """
    try:
        # Check if the wmagent config file path exists in the environment
        if "config" in os.environ:
            os.environ['WMAGENT_CONFIG'] = os.path.join(os.environ["config"], 'config.py')

        config = setup()
        jobCountsByType = countJobs()
        availability = calculateAvailability(config, jobCountsByType)
        buildSLSXML(config, jobCountsByType, availability)
        return 0

    except Exception:
        # Mark the failure in stderr with the local time and the UTC
        # offset in effect; time.timezone / time.altzone are seconds
        # *west* of UTC, altzone applies while DST is active
        now = time.localtime()
        if time.daylight and now.tm_isdst > 0:
            offset = -time.altzone
        else:
            offset = -time.timezone
        sign = "+" if offset >= 0 else "-"
        hours, minutes = divmod(abs(offset) // 60, 60)
        timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", now)
        timestamp += "%s%02d:%02d" % (sign, hours, minutes)
        sys.stderr.write('\n'+str(timestamp)+'\n')
        # A bare raise keeps the traceback of the original error
        raise

if __name__ == "__main__":
    sys.exit(main())
Loading