Hive function advanced
-
Explodes (UDTF)
- NBA championship team
create table the_nba_championship( team_name string, champion_year array<string> ) row format delimited fields terminated by ', ' collection items terminated by '|'; -- Split champion_year using explode function Copy the code
-
lateral View
-
Row and column conversion concat
Column is
select count(case when gender='male' then id end) as 'male'.count(case when gender='woman' then id end) as 'woman' from students Copy the code
male female 30 40 Line to compare
select gender,count(id) as count from student group by gender Copy the code
gender count male 30 female 40 - Transfer line column
create table row2col2( col1 string, col2 string, col3 int )row format delimited fields terminated by '\t'; Copy the code
- Column turned
create table col2row2( col1 string, col2 string, col3 string )row format delimited fields terminated by '\t'; Copy the code
-
Json data processing
- A single field
create table tb_json_test1 ( json string ); get_json_object(json,'$.deviceType') Copy the code
- Multiple fields
create table tb_json_test2 ( device string, deviceType string, signal double, `time` string ) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' STORED AS TEXTFILE; json_tuple(json,'device'.'deviceType') Copy the code
- serder
create table tb_json_test2 ( device string, deviceType string, signal double, `time` string ) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' STORED AS TEXTFILE; Copy the code
XML function extended xpath; SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>', 'a/b/text()')Copy the code
The window function
Over (partition by (1)); Order by column 1… Rows range),
- Build table
-- Create tables and load data
create table website_pv_info(
cookieid string,
createtime string, --day
pv int
) row format delimited
fields terminated by ', ';
create table website_url_info (
cookieid string,
createtime string, -- Access time
url string -- Access page
) row format delimited
fields terminated by ', '
Copy the code
- Sum +group by Common Common aggregation operation
select cookieid,sum(pv) from website_pv_info group by cookieid;
Copy the code
- So let’s figure out the total PV of the site for all the users and all the visits
Copy the code
- Figure out the total PV of each user
select cookieid,sum(pv) over(partition by cookieid) from website_pv_info;
Copy the code
- Figure out the total pv number accumulated by the end of the day for each user
-- Calculate the total pv number accumulated by each user on the day between unbounded preceding and current row equivalent to unbounded preceding
-- Order by default has a calculation range unbounded preceding
select cookieid,sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row ) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime rows unbounded preceding) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime ) from website_pv_info;
Copy the code
- Custom window ranges
unbounded preceding
Copy the code
- Find the Top3 repeated juxtapositions for the most pv visits per user regardless
Dense_rank Specifies the row number. Row_number specifies the row number
select *.rank(a)over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *.dense_rank(a)over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *.row_number(a)over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *.ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
with tb1 as ( select *.dense_rank(a)over (partition by cookieid order by pv desc ) as rank_num from website_pv_info )
from tb1
select *
where rank_num < = 3;
Copy the code
- Collect statistics on the first third of the pv number of each user
with tb1 as (select *.ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info)
from tb1
select *
where rank_num = 1;
Copy the code
- Analysis of the function
select *.lag(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- Value down
select *.lead(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- take the first value
select *.first_value(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
Note the value range because the value range used by Order Derby needs to be added down
select *.last_value(pv) over (partition by cookieid order by pv desc rows between unbounded preceding and unbounded following) as rank_num from website_pv_info;
select sum(pv) sum(sum(pv)) over(a)from website_pv_info
Copy the code
Data compression
- Compression algorithm
- The map compression
1Enable the data compression function for hive intermediate transmissionset hive.exec.compress.intermediate=true;
2Enable the map output compression function in MapReduceset mapreduce.map.output.compress=true;
3) Sets the compression mode for map output data in MapReduceSet mapreduce.map.output.compress.codec= org.apache.hadoop.io.compress.SnappyCodec;
Copy the code
- Reduce the compression
1) Enable the compression function for hive final output dataset hive.exec.compress.output=true;
2) Enables mapReduce final output data compressionset mapreduce.output.fileoutputformat.compress=true;
3) Sets the compression mode for mapReduce final data outputset mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
4) Sets mapReduce final data output compression to block compressionset mapreduce.output.fileoutputformat.compress.type=BLOCK;
Copy the code
Data storage format
- Row storage and column storage
- TextFILE
- ORC
- PAROUET
- Comparison format
TEXTfile
create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE ;
Copy the code
orc
create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc ;
Copy the code
PARQUET
create table log_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET ;
Copy the code