Hive sql基本实操"/>
Hive sql基本实操
#启动hdfs
#启动mysql
#启动hive
#操作基本和sql一样(这是在服务端操作)
#在server01上启动客户端
#在server02上用客户端连接server01上的客户端
#创建,t_a,t_b表
#在server01上/root/dataDir创建文件a.txt、b.txt;内容如下
#
#导入数据
#表里已经有数据了
#join操作
#sql语句其实是转换为map,reduce
#左半连接,只显示左表数据
#分区表,以dt(日期分区)
一
有如下数据
access.log.0804
192.168.33.3,,2017-08-04 15:30:20
192.168.33.3,,2017-08-04 15:35:20
192.168.33.4,,2017-08-04 15:30:20
192.168.33.4,,2017-08-04 16:30:20
192.168.33.5,,2017-08-04 15:40:20
access.log.0805
192.168.33.3,,2017-08-05 15:30:20
192.168.44.3,,2017-08-05 15:35:20
192.168.33.44,,2017-08-05 15:30:20
192.168.33.46,,2017-08-05 16:30:20
192.168.33.55,,2017-08-05 15:40:20
access.log.0806
192.168.133.3,,2017-08-06 15:30:20
192.168.111.3,,2017-08-06 15:35:20
192.168.34.44,,2017-08-06 15:30:20
192.168.33.46,,2017-08-06 16:30:20
192.168.33.55,,2017-08-06 15:40:20
192.168.33.46,,2017-08-06 16:30:20
192.168.33.25,,2017-08-06 15:40:20
192.168.33.36,,2017-08-06 16:30:20
192.168.33.55,,2017-08-06 15:40:20
二,
建表映射上述数据
create table t_access(ip string,url string,access_time string)
partitioned by (dt string)
row format delimited fields terminated by ',';
三,导入数据
load data local inpath '/root/hivetest/access.log.0804' into table t_access partition(dt='2017-08-04');
load data local inpath '/root/hivetest/access.log.0805' into table t_access partition(dt='2017-08-05');
load data local inpath '/root/hivetest/access.log.0806' into table t_access partition(dt='2017-08-06');
四,sql练习
-- 查看表的分区
show partitions t_access;
--删除分区
alter table t_access drop partition (dt='2017-08-04');
--删除表数据
truncate table t_access;
-- 求8月4号以后,每天,及访问者中ip地址中最大的
select dt,'',count(1),max(ip)
from t_access
where url=''
group by dt having dt>'2017-08-04';
select dt,max(url),count(1),max(ip)
from t_access
where url=''
group by dt having dt>'2017-08-04';
select dt,url,count(1),max(ip)
from t_access
where url=''
group by dt,url having dt>'2017-08-04';
select dt,url,count(1),max(ip)
from t_access
where url='' and dt>'2017-08-04'
group by dt,url;
-- 求8月4号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的
select dt,url,count(1),max(ip)
from t_access
where dt>'2017-08-04'
group by dt,url;
-- 求8月4号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的,且,只查询出总访问次数>2 的记录
-- 方式1:
select dt,url,count(1) as cnts,max(ip)
from t_access
where dt>'2017-08-04'
group by dt,url having cnts>2;
-- 方式2:用子查询
select dt,url,cnts,max_ip
from
(select dt,url,count(1) as cnts,max(ip) as max_ip
from t_access
where dt>'2017-08-04'
group by dt,url) tmp
where cnts>2;
#HIVE 中的复合数据类型
数组
有如下数据:
zhanlang2,wujing:wugang:longmu,2017-08-16
sanshengsanshi,liuyifei:yangyang,2017-08-20
建表映射:
create table t_movie(movie_name string,actors array<string>,first_show date)
row format delimited fields terminated by ','
collection items terminated by ':';
导入数据
load data local inpath '/root/dataDir/movie-actor.txt' into table t_movie;(汉字乱码,暂时没解决)
查看导入结果
select movie_name,actors[0],first_show from t_movie;
select movie_name,actors,first_show
from t_movie where array_contains(actors,'yangyang');
select movie_name
,size(actors) as actor_number
,first_show
from t_movie;
MAP类型
有如下数据:fm.dat
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26
建表映射上述数据
create table t_family(id int,name string,family_members map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
导入数据
load data local inpath '/root/hivetest/fm.dat' into table t_family;
导入结果:
查出每个人的 爸爸、姐妹
select id,name,family_members["father"] as father,family_members["sister"] as sister,age
from t_family;
以下自己练习:
-- 查出每个人有哪些亲属关系
select id,name,map_keys(family_members) as relations,age
from t_family;
-- 查出每个人的亲人名字
select id,name,map_values(family_members) as relations,age
from t_family;
-- 查出每个人的亲人数量
select id,name,size(family_members) as relations,age
from t_family;
-- 查出所有拥有兄弟的人及他的兄弟是谁
-- 方案1:一句话写完
select id,name,age,family_members['brother']
from t_family where array_contains(map_keys(family_members),'brother');
-- 方案2:子查询
select id,name,age,family_members['brother']
from
(select id,name,age,map_keys(family_members) as relations,family_members
from t_family) tmp
where array_contains(relations,'brother');
struct类型
假如有以下数据:
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,赵六,26:female:上海
5,钱琪,35:male:杭州
6,王八,48:female:南京
-- 建表映射上述数据
drop table if exists t_user;
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';
-- 导入数据
load data local inpath '/root/hivetest/user.dat' into table t_user;
-- 查询每个人的id name和地址
select id,name,info.addr
from t_user;
更多推荐
Hive sql基本实操
发布评论