link to github
ADD USER
sudo useradd hduser
CREATE DIRECTORY
hdfs dfs -mkdir /hackerday_ratings
LIST CREATED DIRECTORY
hdfs dfs -ls /
ADD NEW USER
sudo usermod -G hadoop hduser
CHECK FOR EXISTING USER
id hduser
CREATE PWD FOR THE USER
sudo passwd hduser
CHANGE THE OWNERSHIP OF THAT DIRECTORY
Log in as root, then run:
hdfs dfs -chown -R hduser:hadoop /hackerday_ratings
CHECK FOR OWNERSHIP CHANGES
hdfs dfs -ls /
HIVE
CREATE DATABASE
CREATE DATABASE hackerday_ratings;
use hackerday_ratings;
create table sample(id int);
insert into table sample values(2);
select * from sample;
WE CAN LOG IN TO HIVE USING THIS COMMAND
sudo -u hdfs hive
CREATE A SAMPLE FILE
vi sample.txt
press i to enter insert mode, then type:
1 student1
2 student2
press Esc, then type :wq to save and quit
MOVE THE FILE TO HDFS
hdfs dfs -put sample.txt /hackerday_ratings
CHECK THE SIZE OF THE INDIVIDUAL FILES IN THE DIRECTORY
hdfs dfs -du -h /hackerday_ratings
CHECK IF FILE EXISTS
hdfs dfs -ls /hackerday_ratings
CREATE EXTERNAL TABLE
fields terminated by ' '   -- how the fields within a row are separated
lines terminated by '\n'   -- how the lines (rows) are separated

drop table if exists sample;
create external table sample (
  id int,
  name varchar(10)
)
row format delimited
fields terminated by ' '
lines terminated by '\n'
location 'hdfs://172.31.21.255:8020/hackerday_ratings/';
Note: we get this IP (the master/NameNode address) when we connect to the EMR cluster using PuTTY.
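If you need to look the address up later, it can also be read from the cluster configuration; a small sketch (run on the cluster itself):
hdfs getconf -confKey fs.defaultFS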
INCREASING PERFORMANCE IN HIVE
- By default, Hive runs queries as MapReduce jobs
- Data gets into the mapper
- The mapper output is stored on the local file system
- From the local file system it goes to the reducer
- From the reducer it goes to the final output
When the reducer reads that intermediate data, the hardware incurs its own disk I/O overhead.
To increase performance and reduce this overhead, Tez was built.
Tez keeps the output of the mapper phase in memory.
WE CAN USE SET TO CHANGE THE EXECUTION ENGINE
SET hive.execution.engine=mr;
SET hive.execution.engine=tez;
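A minimal sketch of how this is used in practice (the count query is only an illustration against the sample table created earlier): print the current engine, switch the session to Tez, and re-run a query to compare runtimes.
SET hive.execution.engine;        -- print the current engine
SET hive.execution.engine=tez;    -- switch this session to Tez
select count(*) from sample;      -- re-run a query and compare runtimes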
SET COMMAND
- Used for changing Hive configuration properties (for example, mapper and reducer task settings)
- Can be used to tune how a job uses the cluster
CHANGE THE NUMBER OF MAP REDUCE TASKS
If we want to change the number of reduce tasks, we can specify how many tasks to run.
Note: by default it is -1, which means Hive decides the number of reducers automatically.
SET mapred.reduce.tasks;
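For illustration, the same property can be assigned a value for the current session (the number 4 is an arbitrary example):
SET mapred.reduce.tasks=4;    -- run subsequent queries with 4 reduce tasks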
HIVE CONFIGURATION
Hive configuration can be specified in the configuration file (hive-site.xml).
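The active settings can also be inspected from the Hive CLI; a small sketch:
SET;     -- list variables overridden by the user or Hive in this session
SET -v;  -- list all Hive and Hadoop configuration variables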
LOAD DATA DIRECTLY USING THE HADOOP DISTCP COMMAND
This option is used to copy data across clusters.
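A minimal sketch of the distcp syntax, with placeholder NameNode addresses standing in for the real source and destination clusters:
hadoop distcp hdfs://source-namenode:8020/hackerday_ratings hdfs://dest-namenode:8020/hackerday_ratings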
FETCH DATA USING WGET
If the data is available over HTTP (for example via an API or a download link), we can use wget.
wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
wget http://files.grouplens.org/datasets/movielens/ml-latest.zip
UNZIP FILES
unzip ml-100k.zip
unzip ml-1m.zip
unzip ml-latest.zip
CREATE DIRECTORY IN HDFS
hdfs dfs -mkdir /hackerday_ratings/100k
hdfs dfs -mkdir /hackerday_ratings/million
hdfs dfs -mkdir /hackerday_ratings/latest
RENAME LOCAL DIRECTORIES
mv ml-100k 100k
mv ml-1m million
mv ml-latest latest
MAKE DIRECTORIES FOR THE DATASET
hdfs dfs -mkdir /hackerday_ratings/100k/data
hdfs dfs -mkdir /hackerday_ratings/100k/item
hdfs dfs -mkdir /hackerday_ratings/100k/info
hdfs dfs -mkdir /hackerday_ratings/100k/genre
hdfs dfs -mkdir /hackerday_ratings/100k/user
hdfs dfs -mkdir /hackerday_ratings/100k/occupation
COPY DATA FILE FROM LOCAL TO HDFS LOCATION
hdfs dfs -copyFromLocal /home/hadoop/100k/u.data /hackerday_ratings/100k/data
hdfs dfs -copyFromLocal /home/hadoop/100k/u.item /hackerday_ratings/100k/item
hdfs dfs -copyFromLocal /home/hadoop/100k/u.info /hackerday_ratings/100k/info
hdfs dfs -copyFromLocal /home/hadoop/100k/u.genre /hackerday_ratings/100k/genre
hdfs dfs -copyFromLocal /home/hadoop/100k/u.user /hackerday_ratings/100k/user
hdfs dfs -copyFromLocal /home/hadoop/100k/u.occupation /hackerday_ratings/100k/occupation
GET THE NECESSARY JAR FILES
wget https://s3.amazonaws.com/hackerday.bigdata/37/hive-serdes-1.0-SNAPSHOT.jar
wget https://s3.amazonaws.com/hackerday.bigdata/37/flume-sources-1.0-SNAPSHOT.jar
FETCH CSV SERDE
wget https://github.com/downloads/ogrodnek/csv-serde/csv-serde-1.1.2.jar
COPY CSV SERDE TO HDFS
hdfs dfs -copyFromLocal /home/hadoop/csv-serde-1.1.2.jar /hackerday_ratings/
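Before a SerDe from one of these jars can be referenced in a table definition, it normally has to be registered in the Hive session; a minimal sketch assuming the local download path used above:
ADD JAR /home/hadoop/csv-serde-1.1.2.jar;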
DATA
KNOW THE DISTRIBUTION OF THE FILE
hdfs fsck /hackerday_ratings/100k/data/u.data -blocks -locations
READ TOP 10 OR BOTTOM TEN USING HEAD OR TAIL
hdfs dfs -cat /hackerday_ratings/100k/data/u.data | head -n 10
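The bottom ten lines can be read the same way with tail (same path as above):
hdfs dfs -cat /hackerday_ratings/100k/data/u.data | tail -n 10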
Note: the rating time is stored as a number (Unix epoch seconds), so it has to be converted into a timestamp.
CREATE AN EXTERNAL TABLE FOR DATA
drop table if exists 100k_data;
create external table 100k_data (
  user_id int,
  item_id int,
  rating double,
  rating_time bigint
)
row format delimited
fields terminated by '\t'
lines terminated by '\n'
location '/hackerday_ratings/100k/data';
FETCH DATA WITH LIMIT
select * from 100k_data limit 10;
Output shown in hue
ITEMS
CHECK IF THE DATA EXISTS AT THE SPECIFIED LOCATION
hdfs dfs -cat /hackerday_ratings/100k/item/u.item | head -n 10
CREATE EXTERNAL TABLE FOR ITEM
drop table if exists 100k_item;
create external table 100k_item (
  movie_id int,
  movie_title varchar(100),
  release_date string,
  video_release_date string,
  IMDB_URL varchar(5000),
  unknown tinyint,
  Action tinyint,
  Adventure tinyint,
  Animation tinyint,
  Children tinyint,
  Comedy tinyint,
  Crime tinyint,
  Documentary tinyint,
  Drama tinyint,
  Fantasy tinyint,
  Film_Noir tinyint,
  Horror tinyint,
  Musical tinyint,
  Mystery tinyint,
  Romance tinyint,
  Sci_Fi tinyint,
  Thriller tinyint,
  War tinyint,
  Western tinyint
)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/item';
Output shown in hue
INFO
Analyze the existing dataset
hdfs dfs -cat /hackerday_ratings/100k/info/u.info | head -n 10
943 users
1682 items
100000 ratings
CREATE EXTERNAL TABLE
drop table if exists 100k_info;
create external table 100k_info (
  no int,
  type varchar(100)
)
row format delimited
fields terminated by ' '
lines terminated by '\n'
location '/hackerday_ratings/100k/info';
SELECT * FROM 100k_info LIMIT 10;
Output shown in hue
GENRE
Analyze existing dataset
hdfs dfs -cat /hackerday_ratings/100k/genre/u.genre | head -n 10
CREATE EXTERNAL TABLE IN HIVE
drop table if exists 100k_genre;
create external table 100k_genre (
  type varchar(100),
  value tinyint
)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/genre';
SELECT * FROM 100k_genre LIMIT 10;
Output shown in hue
USER
Analyze the data
hdfs dfs -cat /hackerday_ratings/100k/user/u.user | head -n 10
CREATE EXTERNAL TABLE FOR USER
drop table if exists 100k_user;
create external table 100k_user (
  user_id varchar(100),
  age tinyint,
  gender varchar(5),
  occupation varchar(100),
  zipcode int
)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/user';
SELECT * FROM 100k_user LIMIT 10;
Output shown in hue
OCCUPATION
Analyze the data
hdfs dfs -cat /hackerday_ratings/100k/occupation/u.occupation | head -n 10
CREATE EXTERNAL TABLE FOR OCCUPATION
drop table if exists 100k_occupation;
create external table 100k_occupation (
  type varchar(100)
)
row format delimited
fields terminated by ''
lines terminated by '\n'
location '/hackerday_ratings/100k/occupation';
SELECT * FROM 100k_occupation LIMIT 10;
Output shown in hue
SKIP LINES WHILE LOADING IN EXTERNAL TABLE
Here we skip (eliminate) the last row of the file.
drop table if exists 100k_genre;
create external table 100k_genre (
  type varchar(100),
  value tinyint
)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/genre'
TBLPROPERTIES ("skip.footer.line.count" = "1");
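For reference, leading rows can be skipped with the counterpart property skip.header.line.count; a sketch with a hypothetical table name (u.genre has no header row, so this is purely illustrative):
drop table if exists 100k_genre_no_header;
create external table 100k_genre_no_header (
  type varchar(100),
  value tinyint
)
row format delimited
fields terminated by '|'
lines terminated by '\n'
location '/hackerday_ratings/100k/genre'
TBLPROPERTIES ("skip.header.line.count" = "1");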
CONVERT THE BIGINT UNIX TIME TO A TIMESTAMP
select from_unixtime(rating_time) from 100k_data limit 10;
Output shown in hue
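from_unixtime also accepts an output format string; a small sketch (the 'yyyy-MM-dd' pattern is just an example):
select from_unixtime(rating_time, 'yyyy-MM-dd') from 100k_data limit 10;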