***************************************************************
1) WHICH YEAR HAS THE MOST NO OF RATINGS***************************************************************
select year(from_unixtime(rating_time)) rating_year, count(*) as cnt from latest_ratings group by year(from_unixtime(rating_time)) order by rating_year DESC; YEAR RATING_YEAR 2018 1086549 2017 1973721 2016 2077152 2015 1907576 2014 584216 2013 633573 2012 792701 2011 833774 2010 982104 2009 993111 2008 1210356 2007 1095710 2006 1210803 2005 1849719 2004 1201656 2003 1079130 2002 910350 2001 1239432 2000 2033738 1999 1231173 1998 329704 1997 763929 1996 1733263 1995 4
This can be achieved by average ratings
average ratings = Sum of ratings / count (user ID)
Note:
* Same user might have reviewed multiple times. This is the reason user should be distinct
* Usually for one movie we would be asked to rate once
* If everyone is getting average rating as 5.0 then we have a problem. It could be that particular
movies would have been rated less. We might also get every rating as 5. we can have an having
condition where it would be greater than 500
* we need to join with latest movies table to get the movie name
* To get the top rated movie we would be using the ranking function where ranking =1
select a.rating_year, b.title, a.avg_rating from (select movie_id , cast(sum(rating) as double)/count(distinct user_id) avg_rating , year(from_unixtime(rating_time)) rating_year, rank() over (partition by year(from_unixtime(rating_time)) order by cast(sum(rating) as double)/count(distinct user_id) desc) ranking from latest_ratings group by movie_id, year(from_unixtime(rating_time)) having count(distinct user_id) > 500 ) a join latest_movies b on a.movie_id = b.movie_id where a.ranking = 1 order by a.rating_year; RATING_YEAR TITLE AVERAGE_RATING 1996 Schindler's List (1993) 4.514696276943174 1997 Wallace & Gromit: The Wrong Trousers (1993) 4.5112903225806456 1998 Saving Private Ryan (1998) 4.480701754385965 1999 "Godfather 4.544752645124392 2000 "Shawshank Redemption 4.553487376979033 2001 "Shawshank Redemption 4.584341906202723 2002 "Usual Suspects 4.589861751152074 2003 "Shawshank Redemption 4.495418448381185 2004 "Shawshank Redemption 4.3772394136807815 2005 "Shawshank Redemption 4.3538953621201735 2006 "Shawshank Redemption 4.390079103493737 2007 "Shawshank Redemption 4.396220472440945 2008 "Shawshank Redemption 4.435086810054418 2009 "Shawshank Redemption 4.44210722263589 2010 "Shawshank Redemption 4.432236376503893 2011 "Shawshank Redemption 4.45922131147541 2012 "Shawshank Redemption 4.408857509627728 2013 "Shawshank Redemption 4.437730287398674 2014 "Shawshank Redemption 4.3894086826347305 2015 "Shawshank Redemption 4.392230338677545 2016 "Shawshank Redemption 4.3361185383244205 2017 Planet Earth (2006) 4.4364844903988185 2018 "Shawshank Redemption 4.35815356065878
****************************************************************
3) EVERY YEAR HAVING A MOVIE WHICH HAS MAX USERS
*****************************************************************
* Every year for any movie id what is the max users (How many users has rated that movie in the particular year)
* We need every year from 1
* We would match the year rating with movie_id and max users with user count
create table tmp_movie_max_users_yearly as select b.movie_id, max(c.rating_year) myear, b.max_users from (select movie_id, max(user_count) max_users from ( select movie_id , year(from_unixtime(rating_time)) rating_year , count(distinct user_id) user_count from latest_ratings group by movie_id, year(from_unixtime(rating_time))) a group by movie_id ) b join (select movie_id , year(from_unixtime(rating_time)) rating_year , count(distinct user_id) user_count from latest_ratings group by movie_id, year(from_unixtime(rating_time))) c on b.movie_id = c.movie_id and b.max_users = c.user_count group by b.movie_id,b.max_users; MOVIE_ID MYEAR MAX_USERS 193819 2018 1 193821 2018 3 193823 2018 1 193825 2018 1 193827 2018 1 193829 2018 1 193833 2018 1 193835 2018 1 193837 2018 1 193839 2018 1 193841 2018 1 193843 2018 1 193845 2018 1 193849 2018 2 193851 2018 1 193853 2018 1 193855 2018 1 193857 2018 1 193859 2018 1 193861 2018 1
*********************************************************************************4) JOIN THE TABLE WHICH HAS MAX USERS EVERY YEAR FOR A MOVIE WITH TOP RATED MOVIES IN EACH YEAR
*********************************************************************************
select c.rating_year, d.title, c.avg_rating from ( select a.movie_id , cast(sum(a.rating) as double)/count(distinct a.user_id) avg_rating , year(from_unixtime(a.rating_time)) rating_year , rank() over (partition by year(from_unixtime(a.rating_time)) order by cast(sum(a.rating) as double)/count(distinct a.user_id) desc) ranking from latest_ratings a join tmp_movie_max_users_yearly b on a.movie_id=b.movie_id and year(from_unixtime(a.rating_time)) = b.myear group by a.movie_id,year(from_unixtime(a.rating_time)) having count(distinct a.user_id) > 100 ) c join latest_movies d on c.movie_id = d.movie_id where c.ranking = 1 order by c.rating_year; RATING_YEAR TITLE AVERAGE RATING 1996 Schindler's List (1993) 4.514696276943174 1997 Star Wars: Episode IV - A New Hope (1977) 4.268936690794169 1998 Money Talks (1997) 2.9597701149425286 1999 World of Apu, The (Apur Sansar) (1959) 4.510989010989011 2000 Wallace & Gromit: A Close Shave (1995) 4.504720406681191 2001 Sweet Smell of Success (1957) 4.288135593220339 2002 Z (1969) 4.175438596491228 2003 Europa Europa (Hitlerjunge Salomon) (1990) 4.11734693877551 2004 To Have and Have Not (1944) 4.090686274509804 2005 Ikiru (1952) 4.366071428571429 2006 Why We Fight (2005) 4.016981132075472 2007 Hearts of Darkness: A Filmmakers Apocalypse (1991) 4.065934065934066 2008 49 Up (2005) 4.05607476635514 2009 Seven Up! (1964) 4.085987261146497 2010 Decalogue, The (Dekalog) (1989) 4.208333333333333 2011 BURN-E (2008) 3.9066390041493775 2012 The Artist (2011) 3.928813559322034 2013 Game Change (2012) 3.772277227722772 2014 Philomena (2013) 3.853395061728395 2015 Fight Club (1999) 4.290882315277244 2016 The Blue Planet (2001) 4.328313253012048 2017 Planet Earth II (2016) 4.4929292929292926 2018 The Godfather Trilogy: 1972-1990 (1992) 4.343939393939394
********************************************************************************
5)WHICH MOVIE HAS BEEN RATED 50% MORE AFTER 5 YEARS FROM THEIR RELEASE + NEXT YEAR
********************************************************************************
* We need to have the release date
* We need to have 5 years of movie data from thier release date
SD - 2005 : 10000 - 4.2 - (10/12)=.83 (4.2 * .83) =3.4986
Ikuri - 2005 : 6000 - 4.366 - (60/12) =.5 (4.366 * .50) =3.183
*Our target is to have a movie release date
Which we need to get it from
Ex : Toy Story (1995)
In this case it starts after the parenthesis and year is 4 character. so it would be 1995
5.1) select substring(title, 12, 3) from million_movies limit 10
5.2) select instr(title'(') from million_movies limit 10
Note:This might not be the best solution as movie might have parenthesis
Sol: We need to write a function to find parenthesis.Take the length of the string after
If the length of the string is greater than 4 then we need to consider the second
parenthesis
Note:we also need to consider if there is a second bracket involved
5.3) select distinct
case
when
length(substring(title,instr(title,'(')+1,length(title))) > 5
then
substring(substring(title,instr(title,'(')+1,length(title)),'('+1,4)
else
substring(title,instr(title,'(')+1,length(title)),'('+1,4)
end
from million_movies limit 10;
5.4)select substring(title,instr(reverse(title),'('))
from million_movies limit 10;
5.5)select substring(title,length(title) -instr(reverse(title), '('
from million_movies limit 10
5.6) select substring(title,length(title) - instr(reverse(title),
,'('))+2,4) release_year
from million_movies limit 10
5.7) create table latest_items as select distinct movie_id, title, case when instr(reverse(title),'(' ) = 0 then '' when substring(title,instr((title), '(')+1,4) between 1900 and 2020 then substring(title,instr((title), '(')+1,4) when substring(title,instr((title), '(')+1,4) not between '1900' and '2020' and substring(title,length(title) - instr(reverse(title), '(')+2,4) not between '1900' and '2020' then ' ' else substring(title, length(title) - instr(reverse(title), '(')+2,4) end release_year from latest_movies; MOVIE_ID TITLE MOVIE_YEAR 1 Toy Story (1995) 1995 2 Jumanji (1995) 1995 3 Grumpier Old Men (1995) 1995 4 Waiting to Exhale (1995) 1995 5 Father of the Bride Part II (1995) 1995 6 Heat (1995) 1995 7 Sabrina (1995) 1995 8 Tom and Huck (1995) 1995 9 Sudden Death (1995) 1995 10 GoldenEye (1995) 1995 5.8) JOIN LATEST RATINGS AND LATEST ITEMS create table tmp_movies_rel_usrs as select c.movie_id, max(c.user_counts) max_user_count from (select a.movie_id, year(from_unixtime(a.rating_time)) rating_year, count(distinct a.user_id) user_counts from latest_ratings a join (select distinct movie_id, release_year, release_year+ 1 next_years from latest_items where release_year > 1994 ) b on a.movie_id =b.movie_id where (year(from_unixtime(rating_time)) =b.release_year or year(from_unixtime(rating_time)) =b.next_years) group by a.movie_id,year(from_unixtime(a.rating_time)) having user_counts > 500 ) c group by c.movie_id MOVIE_ID MAX_USER_COUNTS 4 1364 8 589 12 861 16 4209 20 1647 24 1755 28 954 32 11035 36 6071 44 3631 5.8) RATED AFTER 5 YEARS OF RELEASE create table tmp_movies_rel_later_usrs as select a.movie_id, year(from_unixtime(a.rating_time)) rating_year, count(distinct a.user_id) user_counts from latest_ratings a join (select distinct movie_id, release_year, release_year+ 5 after5Years from latest_items where release_year > 0 ) b on a.movie_id =b.movie_id where year(from_unixtime(rating_time)) >= b.after5Years group by a.movie_id,year(from_unixtime(a.rating_time)); MOVIE_ID RATING_YEARS COUNTS 1079 1995 1 5060 1996 345 597 1996 12410 541 1996 1440 111 1996 4242 495 1996 139 595 1996 18193 6918 1996 31 587 1996 12001 589 1996 14519
MOVIE WHICH HAS BEEN RATED 50% MORE AFTER 5 YEARS FROM THE RELEASE NOW WE NEED TO CALCULATE THE PERCENTAGE CHANGE FROM THE YEARS
WE NEED TO PULL OUT THE MAX CHANGES ANY YEAR WHEREVER WE FIND THE CHANGES GREATER THAN 100%
select * from (select c.movie_id, d.title, c.rating_year, c.percentage_change, rank () over (partition by c.movie_id order by c.percentage_change desc) ranking from (select a.movie_id, b.rating_year, (cast((user_counts - max_user_count) as double) /max_user_count) * 100 percentage_change from tmp_movies_rel_usrs a join tmp_movies_rel_later_usrs b on a.movie_id = b.movie_id where (cast((user_counts -max_user_count) as DOUBLE)/max_user_count) >=1 ) c join latest_items d on c.movie_id=d.movie_id ) e where e.ranking =1 order by title limit 100; MOVIE_ID TITLE RATING YEAR PERCENTAGE CHANGE RANKING 93510 21 Jump Street (2012) 2017 178.55855855855856 1 7293 50 First Dates (2004) 2016 103.66300366300368 1 4973 Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001) 2015 183.10473815461344 1 2329 American History X (1998) 2015 294.40715883668906 1 1917 Armageddon (1998) 2005 142.296918767507 1 1784 As Good as It Gets (1997) 2005 112.74961597542243 1 1517 Austin Powers: International Man of Mystery (1997) 2005 548.3535528596187 1 72641 Blind Side, The (2009) 2016 164.95132127955495 1 1673 Boogie Nights (1997) 2005 302.0869565217391 1 6373 Bruce Almighty (2003) 2016 115.06024096385543 1 88140 Captain America: The First Avenger (2011) 2016 220.09493670886076 1 1552 Con Air (1997) 2005 232.66331658291458 1 88163 Crazy, Stupid, Love. (2011) 2016 107.19557195571956 1 91529 Dark Knight Rises, The (2012) 2017 156.56565656565658 1 58559 Dark Knight, The (2008) 2015 126.00052314935914 1 79091 Despicable Me (2010) 2016 257.4074074074074 1 45720 Devil Wears Prada, The (2006) 2016 129.08847184986595 1 99114 Django Unchained (2012) 2017 105.44507575757575 1 4878 Donnie Darko (2001) 2016 447.9859894921191 1
No comments:
Post a Comment