Tuesday, January 15, 2019

SQL question challenge (Delete duplicates)



We will be using duplicate employees table.
Delete duplicate rows in sql
SQL Script to create dup_employees table

link to github

Create table dup_employees
(
     id int,
     first_name nvarchar(50),
     last_name nvarchar(50),
     gender nvarchar(50),
     salary int
);

Insert into dup_employees values (1, 'Mark', 'Hastings', 'Male', 60000);
Insert into dup_employees values (1, 'Mark', 'Hastings', 'Male', 60000);
Insert into dup_employees values (1, 'Mark', 'Hastings', 'Male', 60000);
Insert into dup_employees values (2, 'Mary', 'Lambeth', 'Female', 30000);
Insert into dup_employees values (2, 'Mary', 'Lambeth', 'Female', 30000);
Insert into dup_employees values (3, 'Ben', 'Hoskins', 'Male', 70000);
Insert into dup_employees values (3, 'Ben', 'Hoskins', 'Male', 70000);
Insert into dup_employees values (3, 'Ben', 'Hoskins', 'Male', 70000);



The delete query should delete all duplicate rows except one. Delete all duplicate rows except one in sql

 EMPLOYEES
 1) LIST ALL RECORDS IN THE TABLE
              +------+------------+-----------+--------+--------+
  | id   | first_name | last_name | gender | salary |
  +------+------------+-----------+--------+--------+
  |    1 | Mark       | Hastings  | Male   |  60000 |
  |    1 | Mark       | Hastings  | Male   |  60000 |
  |    1 | Mark       | Hastings  | Male   |  60000 |
  |    2 | Mary       | Lambeth   | Female |  30000 |
  |    2 | Mary       | Lambeth   | Female |  30000 |
  |    3 | Ben        | Hoskins   | Male   |  70000 |
  |    3 | Ben        | Hoskins   | Male   |  70000 |
  |    3 | Ben        | Hoskins   | Male   |  70000 |
  +------+------------+-----------+--------+--------+
2) CREATE TEMPORARY TABLE AND ADD RNK COLUMN AND LOAD DATA FROM dup_employees TABLE
create temporary table temp 
   select row_number() over (partition by id order by id) as rnk, 
     id,first_name,last_name,gender,salary 
     from dup_employees;
  mysql> select * from temp;
  +-----+------+------------+-----------+--------+--------+
  | rnk | id   | first_name | last_name | gender | salary |
  +-----+------+------------+-----------+--------+--------+
  |   1 |    1 | Mark       | Hastings  | Male   |  60000 |
  |   2 |    1 | Mark       | Hastings  | Male   |  60000 |
  |   3 |    1 | Mark       | Hastings  | Male   |  60000 |
  |   1 |    2 | Mary       | Lambeth   | Female |  30000 |
  |   2 |    2 | Mary       | Lambeth   | Female |  30000 |
  |   1 |    3 | Ben        | Hoskins   | Male   |  70000 |
  |   2 |    3 | Ben        | Hoskins   | Male   |  70000 |
  |   3 |    3 | Ben        | Hoskins   | Male   |  70000 |
  +-----+------+------------+-----------+--------+--------+

3) DELETE DUPLICATE RECORDS FROM TEMP TABLE (STAGING TABLE)
 delete from temp where rnk > 1;
  select * from temp;
  +-----+------+------------+-----------+--------+--------+
  | rnk | id   | first_name | last_name | gender | salary |
  +-----+------+------------+-----------+--------+--------+
  |   1 |    1 | Mark       | Hastings  | Male   |  60000 |
  |   1 |    2 | Mary       | Lambeth   | Female |  30000 |
  |   1 |    3 | Ben        | Hoskins   | Male   |  70000 |
  +-----+------+------------+-----------+--------+--------+

4) INSERT DISTINCT RECORDS FROM TEMP TABLE TO MAIN TABLE
insert into dup_employees 
   select id,first_name,last_name,gender,salary 
    from temp;
  select * from dup_employees;
  +------+------------+-----------+--------+--------+
  | id   | first_name | last_name | gender | salary |
  +------+------------+-----------+--------+--------+
  |    1 | Mark       | Hastings  | Male   |  60000 |
  |    2 | Mary       | Lambeth   | Female |  30000 | 
  |    3 | Ben        | Hoskins   | Male   |  70000 |
  +------+------------+-----------+--------+--------+

DDL SCRIPTS FOR CONTACT TABLE

CONTACTS
 DROP TABLE IF EXISTS contacts;
 
  CREATE TABLE contacts (
   id INT PRIMARY KEY AUTO_INCREMENT,
   first_name VARCHAR(50) NOT NULL,
   last_name VARCHAR(50) NOT NULL, 
   email VARCHAR(255) NOT NULL
  );
   
  INSERT INTO contacts (first_name,last_name,email) 
  VALUES ('Carine ','Schmitt','carine.schmitt@verizon.net'),
      ('Jean','King','jean.king@me.com'),
      ('Peter','Ferguson','peter.ferguson@google.com'),
      ('Janine ','Labrune','janine.labrune@aol.com'),
      ('Jonas ','Bergulfsen','jonas.bergulfsen@mac.com'),
      ('Janine ','Labrune','janine.labrune@aol.com'),
      ('Susan','Nelson','susan.nelson@comcast.net'),
      ('Zbyszek ','Piestrzeniewicz','zbyszek.piestrzeniewicz@att.net'),
      ('Roland','Keitel','roland.keitel@yahoo.com'),
      ('Julie','Murphy','julie.murphy@yahoo.com'),
      ('Kwai','Lee','kwai.lee@google.com'),
      ('Jean','King','jean.king@me.com'),
      ('Susan','Nelson','susan.nelson@comcast.net'),
      ('Roland','Keitel','roland.keitel@yahoo.com');
 

5) FIND THE DUPLICATE ROWS
select email,count(email) as dup_count 
   from contacts 
   group by email 
   having count(email) >1;
   +--------------------------+-----------+
   | email                    | dup_count |
   +--------------------------+-----------+
   | jean.king@me.com         |         2 |
   | janine.labrune@aol.com   |         2 |
   | susan.nelson@comcast.net |         2 |
   | roland.keitel@yahoo.com  |         2 |
   +--------------------------+-----------+
  * self join based on the common data
    Note: First id would be always smaller. we need to just keep the first id
  select c.id,d.id,c.email 
   from contacts c join contacts d 
   on c.email=d.email 
   where c.id > d.id;
   +----+----+--------------------------+
   | id | id | email                    |
   +----+----+--------------------------+
   |  6 |  4 | janine.labrune@aol.com   |
   | 12 |  2 | jean.king@me.com         |
   | 13 |  7 | susan.nelson@comcast.net |
   | 14 |  9 | roland.keitel@yahoo.com  |
   +----+----+--------------------------+

6)DELETE DUPLICATE ROWS USING DELETE JOIN STATEMENT
delete c 
   from contacts c 
   join contacts d 
   on c.email=d.email 
   where c.id > d.id;
  
  select id,email 
   from contacts;
   +----+---------------------------------+
   | id | email                           |
   +----+---------------------------------+
   |  1 | carine.schmitt@verizon.net      |
   |  2 | jean.king@me.com                |
   |  3 | peter.ferguson@google.com       |
   |  4 | janine.labrune@aol.com          |
   |  5 | jonas.bergulfsen@mac.com        |
   |  7 | susan.nelson@comcast.net        |
   |  8 | zbyszek.piestrzeniewicz@att.net |
   |  9 | roland.keitel@yahoo.com         |
   | 10 | julie.murphy@yahoo.com          |
   | 11 | kwai.lee@google.com             |
   +----+---------------------------------+

7) DELETE DUPLICATE RECORDS USING WINDOW FUNCTIONS
delete from contacts 
where id in (select id 
from (select a.id,a.email from 
(select id,email,row_number() over (partition by email order by id)as rnk from contacts) 
a where a.rnk > 1) temp);
   

No comments:

Post a Comment