We will be using duplicate employees table.
Delete duplicate rows in sql
SQL Script to create dup_employees table
link to github
Create table dup_employees ( id int, first_name nvarchar(50), last_name nvarchar(50), gender nvarchar(50), salary int ); Insert into dup_employees values (1, 'Mark', 'Hastings', 'Male', 60000); Insert into dup_employees values (1, 'Mark', 'Hastings', 'Male', 60000); Insert into dup_employees values (1, 'Mark', 'Hastings', 'Male', 60000); Insert into dup_employees values (2, 'Mary', 'Lambeth', 'Female', 30000); Insert into dup_employees values (2, 'Mary', 'Lambeth', 'Female', 30000); Insert into dup_employees values (3, 'Ben', 'Hoskins', 'Male', 70000); Insert into dup_employees values (3, 'Ben', 'Hoskins', 'Male', 70000); Insert into dup_employees values (3, 'Ben', 'Hoskins', 'Male', 70000);
The delete query should delete all duplicate rows except one. Delete all duplicate rows except one in sql
EMPLOYEES
1) LIST ALL RECORDS IN THE TABLE
+------+------------+-----------+--------+--------+ | id | first_name | last_name | gender | salary | +------+------------+-----------+--------+--------+ | 1 | Mark | Hastings | Male | 60000 | | 1 | Mark | Hastings | Male | 60000 | | 1 | Mark | Hastings | Male | 60000 | | 2 | Mary | Lambeth | Female | 30000 | | 2 | Mary | Lambeth | Female | 30000 | | 3 | Ben | Hoskins | Male | 70000 | | 3 | Ben | Hoskins | Male | 70000 | | 3 | Ben | Hoskins | Male | 70000 | +------+------------+-----------+--------+--------+
create temporary table temp select row_number() over (partition by id order by id) as rnk, id,first_name,last_name,gender,salary from dup_employees; mysql> select * from temp; +-----+------+------------+-----------+--------+--------+ | rnk | id | first_name | last_name | gender | salary | +-----+------+------------+-----------+--------+--------+ | 1 | 1 | Mark | Hastings | Male | 60000 | | 2 | 1 | Mark | Hastings | Male | 60000 | | 3 | 1 | Mark | Hastings | Male | 60000 | | 1 | 2 | Mary | Lambeth | Female | 30000 | | 2 | 2 | Mary | Lambeth | Female | 30000 | | 1 | 3 | Ben | Hoskins | Male | 70000 | | 2 | 3 | Ben | Hoskins | Male | 70000 | | 3 | 3 | Ben | Hoskins | Male | 70000 | +-----+------+------------+-----------+--------+--------+
3) DELETE DUPLICATE RECORDS FROM TEMP TABLE (STAGING TABLE)
delete from temp where rnk > 1; select * from temp; +-----+------+------------+-----------+--------+--------+ | rnk | id | first_name | last_name | gender | salary | +-----+------+------------+-----------+--------+--------+ | 1 | 1 | Mark | Hastings | Male | 60000 | | 1 | 2 | Mary | Lambeth | Female | 30000 | | 1 | 3 | Ben | Hoskins | Male | 70000 | +-----+------+------------+-----------+--------+--------+
4) INSERT DISTINCT RECORDS FROM TEMP TABLE TO MAIN TABLE
insert into dup_employees select id,first_name,last_name,gender,salary from temp; select * from dup_employees; +------+------------+-----------+--------+--------+ | id | first_name | last_name | gender | salary | +------+------------+-----------+--------+--------+ | 1 | Mark | Hastings | Male | 60000 | | 2 | Mary | Lambeth | Female | 30000 | | 3 | Ben | Hoskins | Male | 70000 | +------+------------+-----------+--------+--------+
DDL SCRIPTS FOR CONTACT TABLE
CONTACTS DROP TABLE IF EXISTS contacts; CREATE TABLE contacts ( id INT PRIMARY KEY AUTO_INCREMENT, first_name VARCHAR(50) NOT NULL, last_name VARCHAR(50) NOT NULL, email VARCHAR(255) NOT NULL ); INSERT INTO contacts (first_name,last_name,email) VALUES ('Carine ','Schmitt','carine.schmitt@verizon.net'), ('Jean','King','jean.king@me.com'), ('Peter','Ferguson','peter.ferguson@google.com'), ('Janine ','Labrune','janine.labrune@aol.com'), ('Jonas ','Bergulfsen','jonas.bergulfsen@mac.com'), ('Janine ','Labrune','janine.labrune@aol.com'), ('Susan','Nelson','susan.nelson@comcast.net'), ('Zbyszek ','Piestrzeniewicz','zbyszek.piestrzeniewicz@att.net'), ('Roland','Keitel','roland.keitel@yahoo.com'), ('Julie','Murphy','julie.murphy@yahoo.com'), ('Kwai','Lee','kwai.lee@google.com'), ('Jean','King','jean.king@me.com'), ('Susan','Nelson','susan.nelson@comcast.net'), ('Roland','Keitel','roland.keitel@yahoo.com');
5) FIND THE DUPLICATE ROWS
select email,count(email) as dup_count from contacts group by email having count(email) >1; +--------------------------+-----------+ | email | dup_count | +--------------------------+-----------+ | jean.king@me.com | 2 | | janine.labrune@aol.com | 2 | | susan.nelson@comcast.net | 2 | | roland.keitel@yahoo.com | 2 | +--------------------------+-----------+ * self join based on the common data Note: First id would be always smaller. we need to just keep the first id select c.id,d.id,c.email from contacts c join contacts d on c.email=d.email where c.id > d.id; +----+----+--------------------------+ | id | id | email | +----+----+--------------------------+ | 6 | 4 | janine.labrune@aol.com | | 12 | 2 | jean.king@me.com | | 13 | 7 | susan.nelson@comcast.net | | 14 | 9 | roland.keitel@yahoo.com | +----+----+--------------------------+
6)DELETE DUPLICATE ROWS USING DELETE JOIN STATEMENT
delete c from contacts c join contacts d on c.email=d.email where c.id > d.id; select id,email from contacts; +----+---------------------------------+ | id | email | +----+---------------------------------+ | 1 | carine.schmitt@verizon.net | | 2 | jean.king@me.com | | 3 | peter.ferguson@google.com | | 4 | janine.labrune@aol.com | | 5 | jonas.bergulfsen@mac.com | | 7 | susan.nelson@comcast.net | | 8 | zbyszek.piestrzeniewicz@att.net | | 9 | roland.keitel@yahoo.com | | 10 | julie.murphy@yahoo.com | | 11 | kwai.lee@google.com | +----+---------------------------------+
7) DELETE DUPLICATE RECORDS USING WINDOW FUNCTIONS
delete from contacts where id in (select id from (select a.id,a.email from (select id,email,row_number() over (partition by email order by id)as rnk from contacts)
a where a.rnk > 1) temp);
No comments:
Post a Comment