Wednesday, July 25, 2018

File format using hive




SEQUENCE FILE
1
2
3
4
5
6
7
Sequencefile
======================
create external table flight_seq 
 (year smallint,month tinyint,dayofmonth tinyint,dayofweek tinyint,
  lateaircraftdelay smallint)
 stored as sequencefile
location '/user/raj_ops/rawdata/handson_train/airline_performance/flights_seq';




AVRO FILE
1
2
3
4
5
6
7
avro
======================
create external table flight_avro 
 (year smallint,month tinyint,dayofmonth tinyint,dayofweek tinyint,
  lateaircraftdelay smallint)
 stored as avro
location '/user/raj_ops/rawdata/handson_train/airline_performance/flights_avro';



ORC
1
2
3
4
5
6
7
orc
======================
create external table flight_orc 
 (year smallint,month tinyint,dayofmonth tinyint,dayofweek tinyint,
  lateaircraftdelay smallint)
 stored as orc
location '/user/raj_ops/rawdata/handson_train/airline_performance/flights_orc';

PARQUET
1
2
3
4
5
6
7
parquet
======================
create external table flight_pq 
 (year smallint,month tinyint,dayofmonth tinyint,dayofweek tinyint,
  lateaircraftdelay smallint)
 stored as parquet
location '/user/raj_ops/rawdata/handson_train/airline_performance/flights_pq';

READING
//reading
select year, count(1) from flight_raw group by year;
select year, count(1) from flight_seq group by year;
select year, count(1) from flight_avro group by year;
set mapred.reduce.tasks=6;
select year, count(1) from flight_orc group by year;
set mapred.reduce.tasks=10;
select year, count(1) from flights_pq group by year;
set mapred.reduce.tasks=-;1


//reading with predicate
select count(1) from flight_raw where year = 2007;
select count(1) from flight_seq where year = 2007;
select count(1) from flight_avro where year = 2007;
select count(1) from flight_orc where year = 2007;
select count(1) from flights_pq where year = 2007;

No comments:

Post a Comment