Long running job monitor – Alert

I have created this script which will monitor long running jobs and alerts the analyst. One advantage of this custom script is to find threshold dynamically using job history!

CREATE PROCEDURE [dbo].[sp_dba_LongRunningJob_Alert](
	@EMailAlertAddress varchar(256) = 'DEFAULT.MAIL@DOMAIN.COM',
	@ThreshholdFactor float = 1.75,
	@ThreshholdDeltaSec int = 300
	)
/*  
	--Description
 --     This procedure will validate long running SQL jobs and send a mail

 --   Input parameters
	--	@EMailAlertAddress	-	Mail Address
	--	@ThreshholdFactor	-	Multiply factor to average time taken to complete the job
	--	@ThreshholdDeltaSec int -	Add this time to threshhold to avoid false alerts from jobs which runs for a short duration

	--Example
	--	exec sp_dba_LongRunningJob_Alert 'myalias@domain.com', 1.75, 300

 --   Additional Comments
	--	Threshold is not fixed in this monitor. We are calculating the threshhold using job history and a factor. I execute this procedure once every hour in my environment to identify long running jobs
    
 --   Author
	--	Vijred - 

 --   Created
	--	2014-04-15

	--Modifications
	--	None
*/
as
	declare @Subject as varchar(128)
	declare @msg as varchar(4096)
	declare @StartTime as datetime
	declare @UpTimeSec as bigint
	declare @UpTimeFormatted as varchar(128)
	set nocount on

	--Determine Server Uptime
	select @StartTime = login_time from master.sys.dm_exec_sessions where session_id = 1
	set @UpTimeSec = datediff(ss, @StartTime, getdate())
	set @UpTimeFormatted = cast ((@UpTimeSec / 86400) as varchar(8)) + ' day(s) ' +
		cast((@UpTimeSec % 86400 / 3600) as varchar(32)) + ' hour(s) ' + 
		cast((@UpTimeSec % 3600 / 60) as varchar(32)) + ' min(s) ' + 
		cast((@UpTimeSec % 60) as varchar(32)) + ' sec(s)'

	--Set all variables in the message
	set @msg = 'Server Name: ' + @@SERVERNAME + char(13) + char(13) +
				'Server Uptime: ' + cast(@UpTimeFormatted as varchar) + char(13) + char(13) +
				'----------------------------------------------------------------' + char(13) + char(13)


	if exists (select * from tempdb.sys.objects where name = '##Jobs' and type in('U'))
		begin
			drop table ##Jobs
		end
	select sj.job_id, JobName = sj.name, StartDate = sja.start_execution_date, EndDate = sja.stop_execution_date,
		status = case
			when isnull(sjh.run_status, -1) = -1 and sja.start_execution_date is null and sja.stop_execution_date is null then 'Idle'
			when isnull(sjh.run_status, -1) = -1 and sja.start_execution_date is not null and sja.stop_execution_date is null then 'Running'
			when isnull(sjh.run_status, -1) = 0 then 'Failed'
			when isnull(sjh.run_status, -1) = 1 then 'Succeeded'
			when isnull(sjh.run_status, -1) = 2 then 'Retry'
			when isnull(sjh.run_status, -1) = 3 then 'Canceled'
		end,
		RunTime = datediff(SS, sja.start_execution_date, getdate())
		into ##Jobs
		from msdb.dbo.sysjobs sj
			join msdb.dbo.sysjobactivity sja on sj.job_id = sja.job_id
			join (select MaxSessionid = max(session_id) from msdb.dbo.syssessions) ss on ss.MaxSessionid = sja.session_id
			left join msdb.dbo.sysjobhistory sjh on sjh.instance_id = sja.job_history_id

	if exists (select * from tempdb.sys.objects where name = '##Job_threshold' and type in('U'))
		begin
			drop table ##Job_threshold
		end
	select jh.job_id, j.name, count(*) as mycount, avg(run_duration)*@ThreshholdFactor+@ThreshholdDeltaSec as targetduration, max(run_duration) as maxduration
	--select jh.job_id, j.name, count(*) as mycount, (avg(run_duration)-avg(run_duration)) as targetduration, max(run_duration) as maxduration -- comment above line and uncomment this line to test
	into ##Job_threshold
	from msdb.dbo.sysjobhistory jh
		JOIN msdb.dbo.sysjobs j ON jh.job_id = j.job_id
		where jh.run_status = 1
			and jh.step_id = 0
			and j.enabled = 1 
		group by jh.job_id, j.name

	--select * from ##Job_threshold 
	--select * from ##Jobs

	select * from ##Job_threshold jt
	join ##Jobs j on j.job_id = jt.job_id
	where j.status in ('Retry', 'Running')
	and j.RunTime > jt.targetduration

	if (@@ROWCOUNT > 0)
	begin
		set @Subject = 'Long Running Job(s) on ' + @@servername
		set @msg = @msg + 'The following jobs have been running for more than expected time ' + char(13) + char(13)
		select @msg = @msg + Name + ' ---> Running For ' + cast(RunTime / 60 as varchar) + ' Minutes  --> Since ' + cast(StartDate as varchar) + char(13) 
			from ##Job_threshold jt
			join ##Jobs j on j.job_id = jt.job_id
			where j.status in ('Retry', 'Running')
			and j.RunTime > jt.targetduration

		--Email the SQL Group a list of jobs running over nn hours
		exec msdb..sp_send_dbmail
			@recipients = @EMailAlertAddress,
			@body = @msg,
			@subject = @Subject
	end
	drop table ##Job_threshold
	drop table ##Jobs

	SET nocount off 
GO

– Vijred

This entry was posted in Productivity, Script, SQL, SQL Query and tagged , , , , , , . Bookmark the permalink.

Leave a comment