Hive sql基本实操

编程入门行业动态更新时间:2024-10-28 13:27:39

Hive sql基本实操

#启动hdfs

#启动mysql

#启动hive

#操作基本和sql一样（这是在服务端操作）

#在server01上启动客户端

#在server02上用客户端连接server01上的客户端

#创建，t_a,t_b表

#在server01上/root/dataDir创建文件a.txt、b.txt;内容如下

#导入数据

#表里已经有数据了

#join操作

#sql语句其实是转换为map,reduce

#左半连接，只显示左表数据

#分区表，以dt(日期分区)

一

有如下数据
access.log.0804

192.168.33.3,,2017-08-04 15:30:20
192.168.33.3,,2017-08-04 15:35:20
192.168.33.4,,2017-08-04 15:30:20
192.168.33.4,,2017-08-04 16:30:20
192.168.33.5,,2017-08-04 15:40:20

access.log.0805

192.168.33.3,,2017-08-05 15:30:20
192.168.44.3,,2017-08-05 15:35:20
192.168.33.44,,2017-08-05 15:30:20
192.168.33.46,,2017-08-05 16:30:20
192.168.33.55,,2017-08-05 15:40:20

access.log.0806

192.168.133.3,,2017-08-06 15:30:20
192.168.111.3,,2017-08-06 15:35:20
192.168.34.44,,2017-08-06 15:30:20
192.168.33.46,,2017-08-06 16:30:20
192.168.33.55,,2017-08-06 15:40:20
192.168.33.46,,2017-08-06 16:30:20
192.168.33.25,,2017-08-06 15:40:20
192.168.33.36,,2017-08-06 16:30:20
192.168.33.55,,2017-08-06 15:40:20

二，

建表映射上述数据
create table t_access(ip string,url string,access_time string)
partitioned by (dt string)
row format delimited fields terminated by ',';

三，导入数据
load data local inpath '/root/hivetest/access.log.0804' into table t_access partition(dt='2017-08-04');
load data local inpath '/root/hivetest/access.log.0805' into table t_access partition(dt='2017-08-05');
load data local inpath '/root/hivetest/access.log.0806' into table t_access partition(dt='2017-08-06');

四，sql练习

-- 查看表的分区
show partitions t_access;

--删除分区
alter table t_access drop partition (dt='2017-08-04');

--删除表数据

truncate table t_access;

-- 求8月4号以后，每天，及访问者中ip地址中最大的
select dt,'',count(1),max(ip)
from t_access
where url=''
group by dt having dt>'2017-08-04';

select dt,max(url),count(1),max(ip)
from t_access
where url=''
group by dt having dt>'2017-08-04';

select dt,url,count(1),max(ip)
from t_access
where url=''
group by dt,url having dt>'2017-08-04';

select dt,url,count(1),max(ip)
from t_access
where url='' and dt>'2017-08-04'
group by dt,url;

-- 求8月4号以后，每天每个页面的总访问次数，及访问者中ip地址中最大的

select dt,url,count(1),max(ip)
from t_access
where dt>'2017-08-04'
group by dt,url;

-- 求8月4号以后，每天每个页面的总访问次数，及访问者中ip地址中最大的，且，只查询出总访问次数>2 的记录
-- 方式1：
select dt,url,count(1) as cnts,max(ip)
from t_access
where dt>'2017-08-04'
group by dt,url having cnts>2;

-- 方式2：用子查询
select dt,url,cnts,max_ip
from
(select dt,url,count(1) as cnts,max(ip) as max_ip
from t_access
where dt>'2017-08-04'
group by dt,url) tmp
where cnts>2;

#HIVE 中的复合数据类型

数组

有如下数据：
zhanlang2,wujing:wugang:longmu,2017-08-16
sanshengsanshi,liuyifei:yangyang,2017-08-20

建表映射：
create table t_movie(movie_name string,actors array<string>,first_show date)
row format delimited fields terminated by ','
collection items terminated by ':';

导入数据
load data local inpath '/root/dataDir/movie-actor.txt' into table t_movie;（汉字乱码，暂时没解决）

查看导入结果

select movie_name,actors[0],first_show from t_movie;

select movie_name,actors,first_show
from t_movie where array_contains(actors,'yangyang');

select movie_name
,size(actors) as actor_number
,first_show
from t_movie;

MAP类型

有如下数据：fm.dat
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26

建表映射上述数据

create table t_family(id int,name string,family_members map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';

导入数据
load data local inpath '/root/hivetest/fm.dat' into table t_family;

导入结果：

查出每个人的爸爸、姐妹
select id,name,family_members["father"] as father,family_members["sister"] as sister,age
from t_family;

以下自己练习：

-- 查出每个人有哪些亲属关系
select id,name,map_keys(family_members) as relations,age
from t_family;

-- 查出每个人的亲人名字
select id,name,map_values(family_members) as relations,age
from t_family;

-- 查出每个人的亲人数量
select id,name,size(family_members) as relations,age
from t_family;

-- 查出所有拥有兄弟的人及他的兄弟是谁
-- 方案1：一句话写完
select id,name,age,family_members['brother']
from t_family where array_contains(map_keys(family_members),'brother');

-- 方案2：子查询
select id,name,age,family_members['brother']
from
(select id,name,age,map_keys(family_members) as relations,family_members
from t_family) tmp
where array_contains(relations,'brother');

struct类型

假如有以下数据：
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,赵六,26:female:上海
5,钱琪,35:male:杭州
6,王八,48:female:南京

-- 建表映射上述数据

drop table if exists t_user;
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';

-- 导入数据
load data local inpath '/root/hivetest/user.dat' into table t_user;

-- 查询每个人的id name和地址
select id,name,info.addr
from t_user;

更多推荐

Hive sql基本实操

本文发布于:2024-03-07 07:02:24，感谢您对本站的认可！

本文链接:https://www.elefans.com/category/jswz/34/1717142.html