Hive sql基本实操

编程入门 行业动态 更新时间:2024-10-28 13:27:39

<a href=https://www.elefans.com/category/jswz/34/1769093.html style=Hive sql基本实操"/>

Hive sql基本实操

#启动hdfs

#启动mysql

 

#启动hive

#操作基本和sql一样(这是在服务端操作)

 

#在server01上启动客户端

#在server02上用客户端连接server01上的客户端

 #创建,t_a,t_b表

#在server01上/root/dataDir创建文件a.txt、b.txt;内容如下

#

#导入数据

#表里已经有数据了

#join操作

#sql语句其实是转换为map,reduce

#左半连接,只显示左表数据

#分区表,以dt(日期分区)

有如下数据
access.log.0804

192.168.33.3,,2017-08-04 15:30:20
192.168.33.3,,2017-08-04 15:35:20
192.168.33.4,,2017-08-04 15:30:20
192.168.33.4,,2017-08-04 16:30:20
192.168.33.5,,2017-08-04 15:40:20

access.log.0805

192.168.33.3,,2017-08-05 15:30:20
192.168.44.3,,2017-08-05 15:35:20
192.168.33.44,,2017-08-05 15:30:20
192.168.33.46,,2017-08-05 16:30:20
192.168.33.55,,2017-08-05 15:40:20

access.log.0806

192.168.133.3,,2017-08-06 15:30:20
192.168.111.3,,2017-08-06 15:35:20
192.168.34.44,,2017-08-06 15:30:20
192.168.33.46,,2017-08-06 16:30:20
192.168.33.55,,2017-08-06 15:40:20
192.168.33.46,,2017-08-06 16:30:20
192.168.33.25,,2017-08-06 15:40:20
192.168.33.36,,2017-08-06 16:30:20
192.168.33.55,,2017-08-06 15:40:20

二,

建表映射上述数据
create table t_access(ip string,url string,access_time string)
partitioned by (dt string)
row format delimited fields terminated by ',';

三,导入数据
load data local inpath '/root/hivetest/access.log.0804' into table t_access partition(dt='2017-08-04');
load data local inpath '/root/hivetest/access.log.0805' into table t_access partition(dt='2017-08-05');
load data local inpath '/root/hivetest/access.log.0806' into table t_access partition(dt='2017-08-06');

四,sql练习

-- 查看表的分区
show partitions t_access;

--删除分区
alter table t_access drop partition (dt='2017-08-04');

--删除表数据

truncate table t_access;

-- 求8月4号以后,每天,及访问者中ip地址中最大的
select dt,'',count(1),max(ip)
from t_access
where url=''
group by dt having dt>'2017-08-04';


select dt,max(url),count(1),max(ip)
from t_access
where url=''
group by dt having dt>'2017-08-04';


select dt,url,count(1),max(ip)
from t_access
where url=''
group by dt,url having dt>'2017-08-04';

select dt,url,count(1),max(ip)
from t_access
where url='' and dt>'2017-08-04'
group by dt,url;


-- 求8月4号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的

select dt,url,count(1),max(ip)
from t_access
where dt>'2017-08-04'
group by dt,url;

-- 求8月4号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的,且,只查询出总访问次数>2 的记录
-- 方式1:
select dt,url,count(1) as cnts,max(ip)
from t_access
where dt>'2017-08-04'
group by dt,url having cnts>2;


-- 方式2:用子查询
select dt,url,cnts,max_ip
from
(select dt,url,count(1) as cnts,max(ip) as max_ip
from t_access
where dt>'2017-08-04'
group by dt,url) tmp
where cnts>2;

#HIVE 中的复合数据类型

数组


有如下数据:
zhanlang2,wujing:wugang:longmu,2017-08-16
sanshengsanshi,liuyifei:yangyang,2017-08-20

 

建表映射:
create table t_movie(movie_name string,actors array<string>,first_show date)
row format delimited fields terminated by ','
collection items terminated by ':';

导入数据
load data local inpath '/root/dataDir/movie-actor.txt' into table t_movie;(汉字乱码,暂时没解决)

查看导入结果

select movie_name,actors[0],first_show from t_movie;

select movie_name,actors,first_show
from t_movie where array_contains(actors,'yangyang');

select movie_name
,size(actors) as actor_number
,first_show
from t_movie;

MAP类型

有如下数据:fm.dat
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26

 

建表映射上述数据

create table t_family(id int,name string,family_members map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

导入数据
load data local inpath '/root/hivetest/fm.dat' into table t_family;

 导入结果:

查出每个人的 爸爸、姐妹
select id,name,family_members["father"] as father,family_members["sister"] as sister,age
from t_family;

以下自己练习:

-- 查出每个人有哪些亲属关系
select id,name,map_keys(family_members) as relations,age
from  t_family;

-- 查出每个人的亲人名字
select id,name,map_values(family_members) as relations,age
from  t_family;

-- 查出每个人的亲人数量
select id,name,size(family_members) as relations,age
from  t_family;

-- 查出所有拥有兄弟的人及他的兄弟是谁
-- 方案1:一句话写完
select id,name,age,family_members['brother']
from t_family  where array_contains(map_keys(family_members),'brother');


-- 方案2:子查询
select id,name,age,family_members['brother']
from
(select id,name,age,map_keys(family_members) as relations,family_members 
from t_family) tmp 
where array_contains(relations,'brother');
 

struct类型

假如有以下数据:
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,赵六,26:female:上海
5,钱琪,35:male:杭州
6,王八,48:female:南京
 

-- 建表映射上述数据

drop table if exists t_user;
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';

-- 导入数据
load data local inpath '/root/hivetest/user.dat' into table t_user;

-- 查询每个人的id name和地址
select id,name,info.addr
from t_user;

更多推荐

Hive sql基本实操

本文发布于:2024-03-07 07:02:24,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1717142.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:Hive   sql

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!