Tuesday, July 17, 2018

Movie ratings project part 2 (Data ingestion)


cont  from movie recommendation part 1

link to github

MILLIONS DATASET (MOVIES, RATINGS, USER)

RENAME FILES
cd million
mv movies.dat movies
mv ratings.dat ratings
mv users.dat users

RATINGS

RATINGS FILE DESCRIPTION
================================================================================

All ratings are contained in the file "ratings.dat" and are in the
following format:

UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

REPLACE DELIMITER "::" WITH "@"
Note: we cannot use "::" delimiter as it would take extra column .


FIND AND REPLACE DELIMITER
sed 's/::/@/g' /home/hadoop/million/ratings > /home/hadoop/million/ratings_clean 

CREATE DIRECTORY FOR RATINGS
hdfs dfs -mkdir /hackerday_ratings/million/ratings


MOVE RATINGS DATASET TO HDFS
hdfs dfs -copyFromLocal /home/hadoop/million/ratings_clean /hackerday_ratings/million/ratingsckerday_ratings/million/ratings

VALIDATE DATA
 hdfs dfs -cat /hackerday_ratings/million/ratings/ratings_clean | head -n 10

CREATE EXTERNAL TABLE FOR MOVIE RATINGS
use hackerday_ratings;
drop table million_ratings;
create external table million_ratings (
user_id int,
movie_id int,
rating double,
rating_time bigint
)
row format delimited
fields terminated by '@'
lines terminated by '\n'
location '/hackerday_ratings/million/ratings';

SELECT * FROM  million_ratings LIMIT 10

USERS

USERS FILE DESCRIPTION
================================================================================

User information is in the file "users.dat" and is in the following
format:

UserID::Gender::Age::Occupation::Zip-code

All demographic information is provided voluntarily by the users and is
not checked for accuracy.  Only users who have provided some demographic
information are included in this data set.

- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:

 *  1:  "Under 18"
 * 18:  "18-24"
 * 25:  "25-34"
 * 35:  "35-44"
 * 45:  "45-49"
 * 50:  "50-55"
 * 56:  "56+"

- Occupation is chosen from the following choices:

 *  0:  "other" or not specified
 *  1:  "academic/educator"
 *  2:  "artist"
 *  3:  "clerical/admin"
 *  4:  "college/grad student"
 *  5:  "customer service"
 *  6:  "doctor/health care"
 *  7:  "executive/managerial"
 *  8:  "farmer"
 *  9:  "homemaker"
 * 10:  "K-12 student"
 * 11:  "lawyer"
 * 12:  "programmer"
 * 13:  "retired"
 * 14:  "sales/marketing"
 * 15:  "scientist"
 * 16:  "self-employed"
 * 17:  "technician/engineer"
 * 18:  "tradesman/craftsman"
 * 19:  "unemployed"
 * 20:  "writer"
FIND AND REPLACE DELIMITER
sed 's/::/@/g' /home/hadoop/million/users > /home/hadoop/million/users_clean 

CREATE USERS DIRECTORY IN HDFS
hdfs dfs -mkdir /hackerday_ratings/million/users

COPY USERS DATASET TO HDFS
hdfs dfs -copyFromLocal /home/hadoop/million/users_clean /hackerday_ratings/million/users

CREATE EXTERNAL TABLE FOR USERS
drop table if exists million_users;
create external table million_users (
user_id int,
gender char(1),
age tinyint,
occupation varchar(20),
zip_code varchar(10)
)
row format delimited
fields terminated by '@'
lines terminated by '\n'
location '/hackerday_ratings/million/users';

SELECT * FROM  million_users LIMIT 10

MOVIES

FIND AND REPLACE DELIMITER
sed 's/::/@/g' /home/hadoop/million/movies > /home/hadoop/million/movies_clean 
CREATE MOVIES DIRECTORY IN HDFS
hdfs dfs -mkdir /hackerday_ratings/million/movies

COPY MOVIES DATASET TO HDFS
 hdfs dfs -copyFromLocal /home/hadoop/million/movies_clean /hackerday_ratings/million/movies


CREATE EXTERNAL TABLE FOR MOVIES

drop table if exists million_movies;
create external table million_movies(
movie_id int,
title varchar(200),
genre varchar(100)
)
row format delimited
fields terminated by '@'
lines terminated by '\n'
location '/hackerday_ratings/million/movies';

SELECT * FROM  million_movies LIMIT 10;

LATEST 

Ratings Data File Structure (ratings.csv)

All ratings are contained in the file ratings.csv. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:
userId,movieId,rating,timestamp
The lines within this file are ordered first by userId, then, within user, by movieId.
Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.

LATEST DATASET

CREATE DIRECTORY 
hdfs dfs -mkdir /hackerday_ratings/latest/ratings
hdfs dfs -mkdir /hackerday_ratings/latest/links
hdfs dfs -mkdir /hackerday_ratings/latest/movies
hdfs dfs -mkdir /hackerday_ratings/latest/genome-tags
hdfs dfs -mkdir /hackerday_ratings/latest/genome-scores

COPY FROM LOCAL TO HDFS
hdfs dfs -copyFromLocal /home/hadoop/latest/ratings.csv /hackerday_ratings/latest/ratings
hdfs dfs -copyFromLocal /home/hadoop/latest/links.csv /hackerday_ratings/latest/links
hdfs dfs -copyFromLocal /home/hadoop/latest/movies.csv /hackerday_ratings/latest/movies
hdfs dfs -copyFromLocal /home/hadoop/latest/tags.csv /hackerday_ratings/latest/tags

CREATE EXTERNAL TABLE
drop table if exists latest_ratings;
create external table latest_ratings (
user_id int,
movie_id int,
rating double,
rating_time bigint
)
row format delimited
fields terminated by ','
lines terminated by '\n'
location '/hackerday_ratings/latest/ratings'
TBLPROPERTIES ("skip.header.line.count" = "1");

drop table if exists latest_movies;
create external table latest_movies(
movie_id int,
title varchar(200),
genre varchar(200)
)
row format delimited
fields terminated by ','
lines terminated by '\n'
location '/hackerday_ratings/latest/movies'
TBLPROPERTIES ("skip.header.line.count" = "1");

drop table if exists latest_links;
create external table latest_links(
movie_id int,
imdbid varchar(10),
tmbid int
)
row format delimited
fields terminated by ','
lines terminated by '\n'
location '/hackerday_ratings/latest/links'
TBLPROPERTIES ("skip.header.line.count" = "1");

drop table if exists latest_tags;
create external table latest_tags(
user_id int,
movie_id int,
tag varchar(500),
rating_time bigint
)
row format delimited
fields terminated by ','
lines terminated by '\n'
location '/hackerday_ratings/latest/tags'
TBLPROPERTIES ("skip.header.line.count" = "1");

There would be problem while inserting the csv data. Hive does,nt provide an option to have quoted values. There could be possibility of a movie which would have "," inside select * from latest_movies where genre like "%19%"
SerDe Overview
SerDe is short for Serializer/Deserializer. Hive uses the SerDe interface for IO. The interface handles both serialization and deserialization and also interpreting the results of serialization as individual fields for processing.
A SerDe allows Hive to read in data from a table, and write it back out to HDFS in any custom format. Anyone can write their own SerDe for their own data formats.
CREATE EXTERNAL TABLE SPECIFYING SERDEPROPERTIES

drop table if exists latest_ratings_serde;
create external table latest_ratings_serde(
user_id string,
movie_id string,
rating string,
rating_time string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"seperatorChar"= "\,",
"quoteChar" = "\""
)
location '/hackerday_ratings/latest/ratings'
TBLPROPERTIES ("skip.header.line.count" = "1");

drop table if exists latest_movies_serde;
create external table latest_movies_serde(
movie_id string,
title string,
genre string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"seperatorChar"= "\,",
"quoteChar" = "\""
)
location '/hackerday_ratings/latest/movies'
TBLPROPERTIES ("skip.header.line.count" = "1");


drop table if exists latest_links_serde;
create external table latest_links_serde(
movie_id string,
imdbid string,
tmbid string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"seperatorChar"= "\,",
"quoteChar" = "\""
)
location '/hackerday_ratings/latest/links'
TBLPROPERTIES ("skip.header.line.count" = "1");

drop table if exists latest_tags;
create external table latest_tags_serde(
user_id string,
movie_id string,
tag string,
rating_time string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES(
"seperatorChar"= "\,",
"quoteChar" = "\""
)
location '/hackerday_ratings/latest/tags'
TBLPROPERTIES ("skip.header.line.count" = "1");
LIMITATIONS OF SERDE 
We need to have only string ot big int . We cannot have varchar or char data type

RECREATING TABLES WITH APPROPRIATE DATA TYPES FOR ANALYSIS
drop table if exists latest_ratings;
create table latest_ratings as 
select cast (user_id as int) user_id, 
cast (movie_id as int) movie_id, 
cast (rating as double) rating ,  
cast (rating_time as bigint) rating_time
from latest_ratings_serde;

--LATEST MOVIE TABLE
drop table if exists latest_movies;
create table latest_movies as
select cast(movie_id as int) movie_id,
cast(title as varchar(200)) title,
cast(genre as varchar(200)) genre
from latest_movies_serde

--LATEST LINK TABLE
drop table if exists latest_links;
create table latest_links as
select cast (movie_id  as int) as movie_id,
cast (imdbid as varchar(10)) as imdbid,
cast (tmbid as int) as tmbid
from latest_links_serde;


No comments:

Post a Comment