Tuesday, July 17, 2018

Movie ratings project part 1 (data ingestion)


link to github

ADD USER
sudo useradd hduser

CREATE DIRECTORY
hdfs dfs -mkdir /hackerday_ratings

LIST CREATED DIRECTORY
hdfs dfs -ls /

ADD THE USER TO THE HADOOP GROUP
sudo usermod -G hadoop hduser

CHECK FOR EXISTING USER
id hduser

SET A PASSWORD FOR THE USER
sudo passwd hduser

CHANGE THE OWNERSHIP OF THE DIRECTORY
Log in as root (or run the command as the HDFS superuser) and change the ownership:

hdfs dfs -chown -R hduser:hadoop /hackerday_ratings

CHECK FOR OWNERSHIP CHANGES
hdfs dfs -ls /


HIVE

CREATE DATABASE

CREATE DATABASE hackerday_ratings;
use hackerday_ratings;
create table sample(id int);
insert into table sample values(2);

select * from sample;


WE CAN LOG IN TO HIVE USING THIS COMMAND
sudo -u hdfs hive

CREATE A SAMPLE FILE
vi sample.txt
press i to enter insert mode, then type:
1 student1
2 student2

press Esc, then type :wq to save and quit

MOVE THE FILE TO HDFS
hdfs dfs -put sample.txt /hackerday_ratings

CHECK THE SIZE OF THE INDIVIDUAL FILES IN THE DIRECTORY
hdfs dfs -du -h /hackerday_ratings


CHECK IF FILE EXISTS
hdfs dfs -ls /hackerday_ratings

CREATE A TABLE ON TOP OF THE FILE
In the DDL below, fields terminated by ' ' tells Hive how the fields within a row are separated, and lines terminated by '\n' tells it how the rows themselves are separated.

drop table if exists sample;
create table sample
(id int, name varchar(10))
row format delimited
fields terminated by ' '
lines terminated by '\n'
location 'hdfs://172.31.21.255:8020/hackerday_ratings/';


Note: this is the private IP of the NameNode; we see it when we connect to the EMR master node using PuTTY. See the sketch below for an alternative to hard-coding it.
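If you prefer not to hard-code the address, the NameNode URI can be read from the cluster configuration (a minimal sketch; the later tables in this post simply use a path-only location instead):

hdfs getconf -confKey fs.defaultFS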

INCREASING PERFORMANCE IN HIVE
  • By default Hive runs each query as a MapReduce job
  • Data flows into the mappers
  • Mapper output is written to the local file system
  • The reducers then read that intermediate data from the local file system
  • The reducer output becomes the final result
Note: after every mapper phase the data gets spilled to disk, and when the reducers
      fetch that data the extra disk and network I/O becomes an overhead.

To reduce this overhead, Tez was built: Tez keeps the output of the mapper phase in
memory instead of spilling it to disk between stages.

WE CAN USE SET TO CHANGE THE EXECUTION ENGINE
SET hive.execution.engine=mr;
SET hive.execution.engine=tez;

SET COMMAND
  • Used for changing Hive and MapReduce configuration for the current session
  • Lets us tune how a query uses the cluster (see the sketch below)
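A minimal sketch of how SET is typically used inside the Hive shell:

SET;                             -- list the settings overridden in this session
SET hive.execution.engine;       -- show the current value of one property
SET hive.execution.engine=tez;   -- change it for this session only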

CHANGE THE NUMBER OF REDUCE TASKS
 If we want to change the number of reduce tasks for a query, we can specify how many tasks should run (see the sketch below).
 Note: by default it is -1, which means Hive decides the number of reducers itself.
  SET mapred.reduce.tasks;
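A minimal sketch, assuming the 100k_data table created later in this post already exists; it forces a fixed number of reducers for one query and then goes back to the default:

SET mapred.reduce.tasks=4;       -- force 4 reducers for the queries that follow
select rating, count(*) from 100k_data group by rating;
SET mapred.reduce.tasks=-1;      -- back to letting Hive decide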

HIVE CONFIGURATION
Hive configuration can also be specified permanently in the Hive configuration file (hive-site.xml) instead of per session with SET; a quick way to inspect it is shown below.
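On EMR the file usually lives under /etc/hive/conf (the path is an assumption; adjust for your distribution). For example, to see what the file sets for the execution engine:

grep -A2 "hive.execution.engine" /etc/hive/conf/hive-site.xml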

LOAD DATA DIRECTLY USING THE HADOOP DISTCP COMMAND
This option is used to copy data across clusters (see the sketch below).
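A minimal sketch; the NameNode hostnames and ports here are placeholders, not values from this project:

hadoop distcp hdfs://source-namenode:8020/hackerday_ratings hdfs://dest-namenode:8020/hackerday_ratings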

FETCH DATA USING WGET
If the data is available over HTTP (for example as a downloadable file or an API endpoint), we can fetch it with wget:

 wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
 wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
 wget http://files.grouplens.org/datasets/movielens/ml-latest.zip

UNZIP FILES

unzip ml-100k.zip
unzip ml-1m.zip
unzip ml-latest.zip

CREATE DIRECTORY IN HDFS

 hdfs dfs -mkdir /hackerday_ratings/100k
 hdfs dfs -mkdir /hackerday_ratings/million
 hdfs dfs -mkdir /hackerday_ratings/latest

RENAME LOCAL DIRECTORIES
mv ml-100k 100k
mv ml-1m million
mv  ml-latest latest

MAKE DIRECTORIES FOR THE DATASET
hdfs dfs -mkdir /hackerday_ratings/100k/data
hdfs dfs -mkdir /hackerday_ratings/100k/item
hdfs dfs -mkdir /hackerday_ratings/100k/info
hdfs dfs -mkdir /hackerday_ratings/100k/genre
hdfs dfs -mkdir /hackerday_ratings/100k/user
hdfs dfs -mkdir /hackerday_ratings/100k/occupation

COPY DATA FILE FROM LOCAL TO HDFS LOCATION
hdfs dfs -copyFromLocal /home/hadoop/100k/u.data /hackerday_ratings/100k/data
hdfs dfs -copyFromLocal /home/hadoop/100k/u.item /hackerday_ratings/100k/item
hdfs dfs -copyFromLocal /home/hadoop/100k/u.info /hackerday_ratings/100k/info
hdfs dfs -copyFromLocal /home/hadoop/100k/u.genre /hackerday_ratings/100k/genre
hdfs dfs -copyFromLocal /home/hadoop/100k/u.user /hackerday_ratings/100k/user
hdfs dfs -copyFromLocal /home/hadoop/100k/u.occupation /hackerday_ratings/100k/occupation


 GET THE NECESSARY JAR FILES
 wget https://s3.amazonaws.com/hackerday.bigdata/37/hive-serdes-1.0-SNAPSHOT.jar
 wget https://s3.amazonaws.com/hackerday.bigdata/37/flume-sources-1.0-SNAPSHOT.jar

 FETCH CSV SERDE
 wget https://github.com/downloads/ogrodnek/csv-serde/csv-serde-1.1.2.jar

 COPY CSV SERDE TO HDFS
 hdfs dfs -copyFromLocal /home/hadoop/csv-serde-1.1.2.jar  /hackerday_ratings/
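The CSV SerDe is not used until later, but as a quick illustration of why it was fetched, here is a minimal sketch of registering the jar and pointing a table at comma-separated data. It assumes the jar was downloaded to /home/hadoop like the other files; the table name, columns and location are made up for the example, and com.bizo.hive.serde.csv.CSVSerde is the serde class this jar is expected to contain (worth verifying against the jar you downloaded):

ADD JAR /home/hadoop/csv-serde-1.1.2.jar;

drop table if exists csv_sample;
create external table csv_sample (
col1 string,
col2 string
)
row format serde 'com.bizo.hive.serde.csv.CSVSerde'
location '/hackerday_ratings/csv_sample';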



KNOW THE DISTRIBUTION OF THE FILE
hdfs fsck /hackerday_ratings/100k/data/u.data -blocks -locations


DATA

READ TOP 10 OR BOTTOM TEN USING HEAD OR TAIL
 hdfs dfs -cat /hackerday_ratings/100k/data/u.data | head -n 10

Note: the rating time is stored as a plain number (Unix epoch seconds), so it has to be converted into a timestamp later.

CREATE AN EXTERNAL TABLE FOR DATA

drop table if exists 100k_data;
create external table 100k_data (
user_id int,
item_id int,
rating double,
rating_time bigint)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
location '/hackerday_ratings/100k/data';

FETCH DATA WITH LIMIT
select * from 100k_data limit 10;

Output shown in hue

ITEMS

CHECK IF THE DATA EXISTS AT THE SPECIFIED LOCATION
 hdfs dfs -cat /hackerday_ratings/100k/item/u.item | head -n 10

drop table if exists 100k_item;
create external table 100k_item
(
movie_id int,
movie_title varchar(100),
release_date string,
video_release_date string,
IMDB_URL varchar(5000),
unknown tinyint,
Action tinyint,
Adventure tinyint,
Animation tinyint,
Children tinyint,
Comedy tinyint,
Crime tinyint,
Documentary tinyint,
Drama tinyint,
Fantasy tinyint,
Film_Noir tinyint,
Horror tinyint,
Musical tinyint,
Mystery tinyint,
Romance tinyint,
Sci_Fi tinyint,
Thriller tinyint,
War tinyint,
Western tinyint
)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/item';

Output shown in hue 

INFO

Analyze the existing dataset
hdfs dfs -cat /hackerday_ratings/100k/info/u.info | head -n 10
943 users
1682 items
100000 ratings


CREATE EXTERNAL TABLE
drop table if exists 100k_info;
create external table 100k_info (
no int,
type varchar(100)
)
row format delimited
fields terminated by ' '
lines terminated by '\n'
location '/hackerday_ratings/100k/info';

select * from 100k_info limit 10;


Output shown in hue

GENRE

Analyze existing dataset
 hdfs dfs -cat /hackerday_ratings/100k/genre/u.genre | head -n 10

CREATE EXTERNAL TABLE IN HIVE
drop table if exists 100k_genre;
create external table 100k_genre (
type varchar(100),
value tinyint
)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/genre';

select * from 100k_genre limit 10;

Output shown in hue

USER 

Analyze the data
 hdfs dfs -cat /hackerday_ratings/100k/user/u.user | head -n 10

CREATE EXTERNAL TABLE FOR USER
drop table if exists 100k_user;
create external table 100k_user (
user_id varchar(100),
age tinyint,
gender varchar(5),
occupation varchar(100),
zipcode int

)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/user';

select * from 100k_user limit 10;

Output shown in hue

OCCUPATION 

Analyze the data
hdfs dfs -cat /hackerday_ratings/100k/occupation/u.occupation | head -n 10

CREATE EXTERNAL TABLE FOR OCCUPATION
drop table if exists 100k_occupation;
create external table 100k_occupation (
type varchar(100)
)
row format delimited
fields terminated by ''
lines terminated by '\n'
location '/hackerday_ratings/100k/occupation';

select * from 100k_occupation limit 10;

Output shown in hue

SKIP LINES WHILE LOADING IN EXTERNAL TABLE
Here we skip the last line of the file (the footer row) instead of loading it into the table.

drop table if exists 100k_genre;
create external table 100k_genre (
type varchar(100),
value tinyint
)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/genre'
TBLPROPERTIES ("skip.footer.line.count" = "1");

CONVERT THE BIGINT RATING TIME TO A READABLE TIMESTAMP

select from_unixtime(rating_time) from 100k_data limit 10;

Output shown in hue
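from_unixtime() returns a formatted string; if an actual timestamp value is needed (for example for date arithmetic), it can be cast. A minimal sketch:

select user_id, item_id, rating,
cast(from_unixtime(rating_time) as timestamp) as rated_at
from 100k_data limit 10;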
