Hive function advanced

  • Explodes (UDTF)

    • NBA championship team
    create table the_nba_championship(
        team_name string,
        champion_year array<string>
    ) row format delimited
    fields terminated by ', '
    collection items terminated by '|';
    
    -- Split champion_year using explode function
    
    Copy the code
  • lateral View

  • Row and column conversion concat

    Column is

    select count(case when gender='male' then id end) as 'male'.count(case when gender='woman' then id end) as 'woman' from students
    Copy the code
    male female
    30 40

    Line to compare

    select gender,count(id) as count from student group by gender
    Copy the code
    gender count
    male 30
    female 40
    • Transfer line column
    create table row2col2(
       col1 string,
       col2 string,
       col3 int
    )row format delimited fields terminated by '\t';
    Copy the code
    • Column turned
    create table col2row2(
       col1 string,
       col2 string,
       col3 string
    )row format delimited fields terminated by '\t';
    Copy the code
  • Json data processing

    • A single field
    create table tb_json_test1 (
      json string
    );
    
    get_json_object(json,'$.deviceType')
    Copy the code
    • Multiple fields
    create table tb_json_test2 (
       device string,
       deviceType string,
       signal double,
       `time` string
     )
    ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
    STORED AS TEXTFILE;
    
     json_tuple(json,'device'.'deviceType')
    Copy the code
    • serder
    create table tb_json_test2 (
       device string,
       deviceType string,
       signal double,
       `time` string
     )
    ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
    STORED AS TEXTFILE;
    Copy the code
    XML function extended xpath; SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>', 'a/b/text()')Copy the code

The window function

Over (partition by (1)); Order by column 1… Rows range),

  • Build table
-- Create tables and load data
create table website_pv_info(
   cookieid string,
   createtime string,   --day
   pv int
) row format delimited
fields terminated by ', ';

create table website_url_info (
    cookieid string,
    createtime string,  -- Access time
    url string       -- Access page
) row format delimited
fields terminated by ', '
Copy the code
  • Sum +group by Common Common aggregation operation
select cookieid,sum(pv) from website_pv_info group by cookieid;
Copy the code
  • So let’s figure out the total PV of the site for all the users and all the visits
Copy the code
  • Figure out the total PV of each user
select cookieid,sum(pv) over(partition by cookieid) from website_pv_info;
Copy the code
  • Figure out the total pv number accumulated by the end of the day for each user
-- Calculate the total pv number accumulated by each user on the day between unbounded preceding and current row equivalent to unbounded preceding
-- Order by default has a calculation range unbounded preceding
select cookieid,sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row ) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime rows unbounded preceding) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime ) from website_pv_info;
Copy the code
  • Custom window ranges
unbounded preceding
Copy the code
  • Find the Top3 repeated juxtapositions for the most pv visits per user regardless
Dense_rank Specifies the row number. Row_number specifies the row number
select *.rank(a)over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *.dense_rank(a)over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *.row_number(a)over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *.ntile(3)  over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;


with tb1 as ( select *.dense_rank(a)over (partition by cookieid order by pv desc ) as rank_num from website_pv_info )
from tb1
select *
 where rank_num < = 3;
Copy the code
  • Collect statistics on the first third of the pv number of each user
with tb1 as (select *.ntile(3)  over (partition by cookieid order by pv desc ) as rank_num from website_pv_info)
from tb1
select *
 where rank_num = 1;
Copy the code
  • Analysis of the function
select *.lag(pv)  over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- Value down
select *.lead(pv)  over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- take the first value
select *.first_value(pv)  over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
Note the value range because the value range used by Order Derby needs to be added down
select *.last_value(pv)  over (partition by cookieid order by pv desc rows between unbounded preceding and unbounded following) as rank_num from website_pv_info;


select sum(pv) sum(sum(pv)) over(a)from website_pv_info
Copy the code

Data compression

  • Compression algorithm
  • The map compression
1Enable the data compression function for hive intermediate transmissionset hive.exec.compress.intermediate=true;
2Enable the map output compression function in MapReduceset mapreduce.map.output.compress=true;
3) Sets the compression mode for map output data in MapReduceSet mapreduce.map.output.compress.codec= org.apache.hadoop.io.compress.SnappyCodec;
Copy the code
  • Reduce the compression
1) Enable the compression function for hive final output dataset hive.exec.compress.output=true;
2) Enables mapReduce final output data compressionset mapreduce.output.fileoutputformat.compress=true;
3) Sets the compression mode for mapReduce final data outputset mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
4) Sets mapReduce final data output compression to block compressionset mapreduce.output.fileoutputformat.compress.type=BLOCK;
Copy the code

Data storage format

  • Row storage and column storage
  • TextFILE
  • ORC
  • PAROUET
  • Comparison format

TEXTfile

create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE ;
Copy the code

orc

create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc ;
Copy the code

PARQUET

create table log_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET ;	
Copy the code