Validate and delete duplicate records from MSSQL table

I had a situation where Index creation failed and business wants to delete duplicate entries immediately from a specific table. I had to validate the columns which are unique and which had different values. Following is easy script to validate duplicates and delete them.

I had a situation where Index creation failed and business wants to delete duplicate entries immediately from a specific table. I had to validate the columns which are unique and which had different values. Following is easy script to validate duplicates and delete them.

Following SQL can be used to delete duplicate records.

WITH CTE AS(   SELECT *,
       RN = ROW_NUMBER() OVER(PARTITION BY col1, col2, col3 ORDER BY uniqueKey DESC)
   FROM MyTABLE)
DELETE TOP(10000) FROM CTE WHERE RN > 1
-- .

Steps used to validate the data before deletion:
Validate number of duplicate record combinations

;WITH CTE AS (
select  col1, col2, col3, count(*) as mycount
,min(uniqueKey) as min_uniqueuekey, max(uniqueKey) as max_ uniqueKey
from MyTABLE WITH (NOLOCK)
group by col1, col2, col3
having count(*) > 1
)
select count(*) from CTE 
-- Above query indicates duplicate occurrences but not actual number of duplicate records  
-- NOTE: We plan to identify duplicate records based on 3 columns but not all columns of the table. 


-- Verify if the records are unique (pick 2 samples in each combination and check if all other columns are same or not)
;WITH CTE AS (
select  col1, col2, col3, count(*) as mycount
,min(uniqueKey) as min_uniqueKey, max(uniqueKey) as max_uniqueKey
from MyTABLE
group by col1, col2, col3
having count(*) > 1
)
select * from CTE 
join MyTABLE s1 on s1.uniqueKey = CTE.min_uniqueKey
JOIN MyTABLE s2 on s2.uniqueKey = CTE.max_uniqueKey
OR s1.Column	  s2.Column
OR s1.Column3	  s2.Column3
OR s1.Column4	  s2.Column4
OR s1.Column5	  s2.Column5
OR s1.Column6	  s2.Column6
OR s1.Column7	  s2.Column7
OR s1.Column8	  s2.Column8
OR s1.Column9	  s2.Column9
OR s1.Column10	  s2.Column10
OR s1.Column11	  s2.Column11
OR s1.Column12	  s2.Column12
OR s1.Column13	  s2.Column13
OR s1.Column14	  s2.Column14
OR s1.Column15	  s2.Column15
-- OR s1.ModifiedOn	  s2.ModifiedOn (We know this value will be different, exclude from validation) 
-- If all other columns are uniqueue, above query should return 0 rows  


-- Check  maximum number of duplicate records for specific criteria
;WITH CTE AS (
select  [UserID] , [StartDateTime] , [EndDateTime], count(*) as mycount
,min(ScheduleID) as min_scheduleID, max(ScheduleID) as max_scheduleID
from Schedule WITH (NOLOCK) 
group by [UserID] , [StartDateTime] , [EndDateTime]
having count(*) > 1 )
select TOP 10 * from CTE ORDER BY 4 DESC 



-- Delete duplicate records: in multiple iterations:

WITH CTE AS(
   SELECT *,
       RN = ROW_NUMBER() OVER(PARTITION BY col1, col2, col3 ORDER BY uniqueKey DESC)
   FROM MyTABLE )
DELETE TOP(10000) FROM CTE WHERE RN > 1
-- .

Ref: https://vijredblog.wordpress.com/2014/08/21/how-to-identify-and-delete-duplicate-records-in-a-table-sql/

Advertisements
This entry was posted in SQL, SQL Query, Uncategorized and tagged , , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s