MILLIONS DATASET (MOVIES, RATINGS, USER)
RENAME FILES
cd million
mv movies.dat movies
mv ratings.dat ratings
mv users.dat users
RATINGS
RATINGS FILE DESCRIPTION ================================================================================ All ratings are contained in the file "ratings.dat" and are in the following format: UserID::MovieID::Rating::Timestamp - UserIDs range between 1 and 6040 - MovieIDs range between 1 and 3952 - Ratings are made on a 5-star scale (whole-star ratings only) - Timestamp is represented in seconds since the epoch as returned by time(2) - Each user has at least 20 ratings
sed 's/::/@/g' /home/hadoop/million/ratings > /home/hadoop/million/ratings_clean
CREATE DIRECTORY FOR RATINGS
hdfs dfs -mkdir /hackerday_ratings/million/ratings
MOVE RATINGS DATASET TO HDFS
hdfs dfs -copyFromLocal /home/hadoop/million/ratings_clean /hackerday_ratings/million/ratingsckerday_ratings/million/ratings
VALIDATE DATA
hdfs dfs -cat /hackerday_ratings/million/ratings/ratings_clean | head -n 10
CREATE EXTERNAL TABLE FOR MOVIE RATINGS
use hackerday_ratings; drop table million_ratings; create external table million_ratings ( user_id int, movie_id int, rating double, rating_time bigint ) row format delimited fields terminated by '@' lines terminated by '\n' location '/hackerday_ratings/million/ratings'; SELECT * FROM million_ratings LIMIT 10
USERS
FIND AND REPLACE DELIMITERUSERS FILE DESCRIPTION ================================================================================ User information is in the file "users.dat" and is in the following format: UserID::Gender::Age::Occupation::Zip-code All demographic information is provided voluntarily by the users and is not checked for accuracy. Only users who have provided some demographic information are included in this data set. - Gender is denoted by a "M" for male and "F" for female - Age is chosen from the following ranges: * 1: "Under 18" * 18: "18-24" * 25: "25-34" * 35: "35-44" * 45: "45-49" * 50: "50-55" * 56: "56+" - Occupation is chosen from the following choices: * 0: "other" or not specified * 1: "academic/educator" * 2: "artist" * 3: "clerical/admin" * 4: "college/grad student" * 5: "customer service" * 6: "doctor/health care" * 7: "executive/managerial" * 8: "farmer" * 9: "homemaker" * 10: "K-12 student" * 11: "lawyer" * 12: "programmer" * 13: "retired" * 14: "sales/marketing" * 15: "scientist" * 16: "self-employed" * 17: "technician/engineer" * 18: "tradesman/craftsman" * 19: "unemployed" * 20: "writer"
sed 's/::/@/g' /home/hadoop/million/users > /home/hadoop/million/users_clean
CREATE USERS DIRECTORY IN HDFS
hdfs dfs -mkdir /hackerday_ratings/million/users
COPY USERS DATASET TO HDFS
hdfs dfs -copyFromLocal /home/hadoop/million/users_clean /hackerday_ratings/million/users
CREATE EXTERNAL TABLE FOR USERS
drop table if exists million_users; create external table million_users ( user_id int, gender char(1), age tinyint, occupation varchar(20), zip_code varchar(10) ) row format delimited fields terminated by '@' lines terminated by '\n' location '/hackerday_ratings/million/users'; SELECT * FROM million_users LIMIT 10
MOVIES
FIND AND REPLACE DELIMITER
CREATE MOVIES DIRECTORY IN HDFS
COPY MOVIES DATASET TO HDFS
CREATE EXTERNAL TABLE FOR MOVIES
sed 's/::/@/g' /home/hadoop/million/movies > /home/hadoop/million/movies_clean
hdfs dfs -mkdir /hackerday_ratings/million/movies
COPY MOVIES DATASET TO HDFS
hdfs dfs -copyFromLocal /home/hadoop/million/movies_clean /hackerday_ratings/million/movies
CREATE EXTERNAL TABLE FOR MOVIES
drop table if exists million_movies;
create external table million_movies(
movie_id int,
title varchar(200),
genre varchar(100)
)
row format delimited
fields terminated by '@'
lines terminated by '\n'
location '/hackerday_ratings/million/movies';
SELECT * FROM million_movies LIMIT 10;
drop table if exists million_movies; create external table million_movies( movie_id int, title varchar(200), genre varchar(100) ) row format delimited fields terminated by '@' lines terminated by '\n' location '/hackerday_ratings/million/movies'; SELECT * FROM million_movies LIMIT 10;
LATEST
Ratings Data File Structure (ratings.csv)
All ratings are contained in the fileratings.csv
. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:userId,movieId,rating,timestamp
The lines within this file are ordered first by userId, then, within user, by movieId.Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
LATEST DATASET
CREATE DIRECTORY
hdfs dfs -mkdir /hackerday_ratings/latest/ratings
hdfs dfs -mkdir /hackerday_ratings/latest/links
hdfs dfs -mkdir /hackerday_ratings/latest/movies
hdfs dfs -mkdir /hackerday_ratings/latest/genome-tags
hdfs dfs -mkdir /hackerday_ratings/latest/genome-scores
hdfs dfs -mkdir /hackerday_ratings/latest/links
hdfs dfs -mkdir /hackerday_ratings/latest/movies
hdfs dfs -mkdir /hackerday_ratings/latest/genome-tags
hdfs dfs -mkdir /hackerday_ratings/latest/genome-scores
COPY FROM LOCAL TO HDFS
hdfs dfs -copyFromLocal /home/hadoop/latest/ratings.csv /hackerday_ratings/latest/ratings hdfs dfs -copyFromLocal /home/hadoop/latest/links.csv /hackerday_ratings/latest/links hdfs dfs -copyFromLocal /home/hadoop/latest/movies.csv /hackerday_ratings/latest/movies hdfs dfs -copyFromLocal /home/hadoop/latest/tags.csv /hackerday_ratings/latest/tags
CREATE EXTERNAL TABLE
drop table if exists latest_ratings; create external table latest_ratings ( user_id int, movie_id int, rating double, rating_time bigint ) row format delimited fields terminated by ',' lines terminated by '\n' location '/hackerday_ratings/latest/ratings' TBLPROPERTIES ("skip.header.line.count" = "1"); drop table if exists latest_movies; create external table latest_movies( movie_id int, title varchar(200), genre varchar(200) ) row format delimited fields terminated by ',' lines terminated by '\n' location '/hackerday_ratings/latest/movies' TBLPROPERTIES ("skip.header.line.count" = "1"); drop table if exists latest_links; create external table latest_links( movie_id int, imdbid varchar(10), tmbid int ) row format delimited fields terminated by ',' lines terminated by '\n' location '/hackerday_ratings/latest/links' TBLPROPERTIES ("skip.header.line.count" = "1"); drop table if exists latest_tags; create external table latest_tags( user_id int, movie_id int, tag varchar(500), rating_time bigint ) row format delimited fields terminated by ',' lines terminated by '\n' location '/hackerday_ratings/latest/tags' TBLPROPERTIES ("skip.header.line.count" = "1");
There would be problem while inserting the csv data. Hive does,nt provide an option to have quoted values. There could be possibility of a movie which would have "," inside select * from latest_movies where genre like "%19%"
SerDe Overview
SerDe is short for Serializer/Deserializer. Hive uses the SerDe interface for IO. The interface handles both serialization and deserialization and also interpreting the results of serialization as individual fields for processing.
A SerDe allows Hive to read in data from a table, and write it back out to HDFS in any custom format. Anyone can write their own SerDe for their own data formats.
CREATE EXTERNAL TABLE SPECIFYING SERDEPROPERTIES
drop table if exists latest_ratings_serde; create external table latest_ratings_serde( user_id string, movie_id string, rating string, rating_time string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "seperatorChar"= "\,", "quoteChar" = "\"" ) location '/hackerday_ratings/latest/ratings' TBLPROPERTIES ("skip.header.line.count" = "1");
drop table if exists latest_movies_serde; create external table latest_movies_serde( movie_id string, title string, genre string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "seperatorChar"= "\,", "quoteChar" = "\"" ) location '/hackerday_ratings/latest/movies' TBLPROPERTIES ("skip.header.line.count" = "1");
drop table if exists latest_links_serde; create external table latest_links_serde( movie_id string, imdbid string, tmbid string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "seperatorChar"= "\,", "quoteChar" = "\"" ) location '/hackerday_ratings/latest/links' TBLPROPERTIES ("skip.header.line.count" = "1"); drop table if exists latest_tags; create external table latest_tags_serde( user_id string, movie_id string, tag string, rating_time string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES( "seperatorChar"= "\,", "quoteChar" = "\"" ) location '/hackerday_ratings/latest/tags' TBLPROPERTIES ("skip.header.line.count" = "1");
We need to have only string ot big int . We cannot have varchar or char data type
RECREATING TABLES WITH APPROPRIATE DATA TYPES FOR ANALYSIS
drop table if exists latest_ratings; create table latest_ratings as select cast (user_id as int) user_id, cast (movie_id as int) movie_id, cast (rating as double) rating , cast (rating_time as bigint) rating_time from latest_ratings_serde; --LATEST MOVIE TABLE drop table if exists latest_movies; create table latest_movies as select cast(movie_id as int) movie_id, cast(title as varchar(200)) title, cast(genre as varchar(200)) genre from latest_movies_serde --LATEST LINK TABLE drop table if exists latest_links; create table latest_links as select cast (movie_id as int) as movie_id, cast (imdbid as varchar(10)) as imdbid, cast (tmbid as int) as tmbid from latest_links_serde;
No comments:
Post a Comment