指标计算"/>
Hive 部分业务数据指标计算
需求:统计连续出现三次的数据的id
文件名称:series.txt
数据格式如下:
id,number
1,1
2,1
3,1
4,2
5,1
6,2
7,2
8,3
9,3
10,3
11,3
12,4
思路:可以做一个自连接,使用笛卡尔积的方式来实现
CREATE TABLE series(
id int ,
number int
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' set hive.fetch.task.conversion=more;
第一步先自连接:
select * from series a join series b on a.id = b.id-1
join series c on b.id=c.id-1;a.id a.number b.id b.number c.id c.number
1 1 2 1 3 1
2 1 3 1 4 2
3 1 4 2 5 1
4 2 5 1 6 2
5 1 6 2 7 2
6 2 7 2 8 3
7 2 8 3 9 3
8 3 9 3 10 3
9 3 10 3 11 3
10 3 11 3 12 4
第二部:加上筛选条件
select a.id,distinct(a.number) from series a
join series b on a.id = b.id-1 join series c on b.id=c.id-1
where a.number=b.number and b.number=c.numbera.id a.number b.id b.number c.id c.number
1 1 2 1 3 1
8 3 9 3 10 3
9 3 10 3 11 3
最终结结果只需去重即可
数据文件 score.txt
需求:统计每个学生成绩最好的课程和分数、最差的课程和分数、求平均分
数据样本:
peck|math:90,english:100,sport:80
alice|math:80,english:99,sport:85
tom|math:62,english:99,sport:88
采用Map类型创建表,使用collection items 指定列分隔符,使用Map 指定数据的key分割符
create table score(
name string,
score map<string, int>
)ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
查询数据:
load data local inpath '/home/hadoop/data/hive/score.txt' into table score;hive (default)> select * from scorescore.name score.score
peck {"math":90,"english":100,"sport":80}
alice {"math":80,"english":99,"sport":85}
tom {"math":62,"english":99,"sport":88}
Time taken: 0.046 seconds, Fetched: 3 row(s)select name,score['math'] math,score['english'] english,score['sport'] sport from score
OK
name math english sport
peck 90 100 80
alice 80 99 85
tom 62 99 88
使用explode函数炸开数据
select explode(score) as (course,score) from score;hive (default)> select explode(score) as (course,score) from score;
course score
math 90
english 100
sport 80
math 80
english 99
sport 85
math 62
english 99
sport 88
使用lateral view 进行炸裂
create table result_tmp as
select ss.name,t.course,t.score
from
score ss
lateral view explode(ss.score) t as course,score;ss.name t.course t.score
peck math 90
peck english 100
peck sport 80
alice math 80
alice english 99
alice sport 85
tom math 62
tom english 99
tom sport 88
进一步操作得到结果数据
select ss.name,max(t.score) max_score,min(t.score) min_score,avg(t.score) avg_score
from
score ss
lateral view explode(ss.score) t as course,score
group by ss.namess.name max_score min_score avg_score
alice 99 80 88.0
peck 100 80 90.0
tom 99 62 83.0select a.name,b.course,a.max_score,a.min_score,a.avg_score
from (
select ss.name,max(t.score) max_score,min(t.score) min_score,avg(t.score) avg_score
from
score ss
lateral view explode(ss.score) t as course,score
group by ss.name ) a
join
(
select ss.name,t.course,t.score
from
score ss
lateral view explode(ss.score) t as course,score) b
on a.name = b.name and a.max_score=b.score ;
换一种写法:使用With进行操作
with
a as (select ss.name,max(t.score) max_score,min(t.score) min_score,avg(t.score) avg_score from score ss lateral view explode(ss.score) t as course,score group by ss.name),
b as (select ss.name,t.course,t.score from score ss lateral view explode(ss.score) t as course,score)
select a.name,b.course,a.max_score,a.min_score,a.avg_score
from a join b
on a.name = b.name and a.max_score=b.score ;OK
a.name b.course a.max_score a.min_score a.avg_score
alice english 99 80 88.0
peck english 100 80 90.0
tom english 99 62 83.0
将数据进行格式的转换
create table result_tmp as
select ss.name,t.course,t.score
from
score ss
lateral view explode(ss.score) t as course,score;ss.name t.course t.score
peck math 90
peck english 100
peck sport 80
alice math 80
alice english 99
alice sport 85
tom math 62
tom english 99
tom sport 88select t.name ,collect_list(concat_ws("-",t.course,cast(t.score as string))) as score
from result_tmp t
group by t.nameOK
t.name score
alice ["math-80","english-99","sport-85"]
peck ["math-90","english-100","sport-80"]
tom ["math-62","english-99","sport-88"]将上述拆分出来
select s.name,
split(s.score[0],"-")[1] as math_score,
split(s.score[1],"-")[1] as english_score,
split(s.score[2],"-")[1] as sport_score
from
(select t.name ,collect_list(concat_ws("-",t.course,cast(t.score as string))) as score
from result_tmp t
group by t.name) ss.name math_score english_score sport_score
alice 80 99 85
peck 90 100 80
tom 62 99 88
上述是行转列
需求:每个域名截止到每个月 最大的访问次数 累计到月的总访问次数
数据形式:
google.com,2020-01-02,5
google.com,2020-01-03,15
baidu.com,2020-01-01,5
youdao.com,2020-01-04,8
google.com,2020-01-02,25
google.com,2020-01-05,5
baidu.com,2020-02-02,4
youdao.com,2020-02-02,6
google.com,2020-02-02,10
google.com,2020-02-02,5
baidu.com,2020-02-02,16
youdao.com,2020-02-02,22
google.com,2020-03-02,23
google.com,2020-03-02,10
baidu.com,2020-03-02,11
youdao.com,2020-03-02,13pv max sum
google.com,2020-01,50 50 50
google.com,2020-02,15 50 65
google.com,2020-03,33 50 98
创建表
create table access(
domain string,
day string,
pv int
)row format delimited fields terminated by ',';
加载数据
load data local inpath '/home/hadoop/data/hive/access.txt' into table accesshive (default)> select * from access;access.domain access.day access.pv
google.com 2020-01-02 5
google.com 2020-01-03 15
baidu.com 2020-01-01 5
1.先把每个月的指标算出来
select domain,date_format(day,'yyyy-MM') month,sum(pv) pv
from access
group by domain,date_format(day,'yyyy-MM');domain month pv
baidu.com 2020-01 5
baidu.com 2020-02 20
baidu.com 2020-03 11
google.com 2020-01 50
google.com 2020-02 15
google.com 2020-03 33
youdao.com 2020-01 8
youdao.com 2020-02 28
youdao.com 2020-03 13
2.根据上述计算结果算出每个月的指标
select
t.domain,
t.month,
t.pv,
max(t.pv) over(partition by t.domain order by t.domain,t.month rows between unbounded preceding and current row) max_pv,
sum(t.pv) over(partition by t.domain order by t.domain,t.month rows between unbounded preceding and current row) sum_pv
from
(
select domain,date_format(day,'yyyy-MM') month,sum(pv) pv
from access
group by domain,date_format(day,'yyyy-MM')
) tt.domain t.month t.pv max_pv sum_pv
baidu.com 2020-01 5 5 5
baidu.com 2020-02 20 20 25
baidu.com 2020-03 11 20 36
google.com 2020-01 50 50 50
google.com 2020-02 15 50 65
google.com 2020-03 33 50 98
youdao.com 2020-01 8 8 8
youdao.com 2020-02 28 28 36
youdao.com 2020-03 13 28 49
第二种方式采用自连接的方式实现
create table access_tmp as
select domain,date_format(day,'yyyy-MM') month,sum(pv) pv
from access
group by domain,date_format(day,'yyyy-MM');select * from access_tmpaccess_tmp.domain access_tmp.month access_tmp.pv
baidu.com 2020-01 5
baidu.com 2020-02 20
baidu.com 2020-03 11
google.com 2020-01 50
google.com 2020-02 15
google.com 2020-03 33
youdao.com 2020-01 8
youdao.com 2020-02 28
youdao.com 2020-03 13create table access_tmp2 as
select a.domain a_domain,a.month a_month,a.pv a_pv,b.domain b_domain,b.month b_month,b.pv b_pv
from
access_tmp a join access_tmp b
on a.domain = b.domainselect t.b_domain,t.b_month,t.b_pv,max(t.a_pv) max_pv,sum(t.a_pv) sum_pv
from (
select * from access_tmp2 where a_month <= b_month
) t
group by t.b_domain,t.b_month,t.b_pvt.b_domain t.b_month t.b_pv max_pv sum_pv
baidu.com 2020-01 5 5 5
baidu.com 2020-02 20 20 25
baidu.com 2020-03 11 20 36
google.com 2020-01 50 50 50
google.com 2020-02 15 50 65
google.com 2020-03 33 50 98
youdao.com 2020-01 8 8 8
youdao.com 2020-02 28 28 36
youdao.com 2020-03 13 28 49
更多推荐
Hive 部分业务数据指标计算
发布评论