赞
踩
前面已经讲过基本数据类型,以下是复杂基本类型
复杂类型分为三种,分别是 数组array,键值对map,和结构体struct
array : col array<基本类型> ,下标从0开始,越界不报错,以NULL代替
map : column map<string,string>
struct: col struct 123
-- 数据如下: 注意下面列之间是通过TAB来分隔的 zhangsan 78,89,92,96 lisi 67,75,83,94 # 注意terminated顺序,新建数组类型 create table if not exists arr1 ( name string, score array<string> ) row format delimited fields terminated by '\t' collection items terminated by ','; #collection items terminated by ','是元素分隔符,这里的意思是,用','来进行array的切割 # 导入数据 load data local inpath '/root/hivetest/arr1.csv' into table arr1; # 查询: select * from arr1; select name,score[1] from arr1 where size(score) > 3;
1.1.1列转行
就是把一列数据转化成多行,如下:
#原始数据:
zhangsan 90,87,63,76
#转化后数据
zhangsan 90
zhangsan 87
zhangsan 63
zhangsan 76
内嵌插叙:
列表中的每个元素生成一行
select explode(score) score from arr1;
侧视图的意义是配合explode,一个语句生成把单行数据拆解成多行后的数据结果集。
解释:lateral view 会将explode生成的结果放到一个虚拟表中,然后这个虚拟表会和当前表
join,来达到数据聚合的目的。
结构解析:要进行聚合的虚拟表,lateral view explode(字段) 虚拟表名 as 虚拟表字段名
select name,cj from arr1 lateral view explode(score) score as cj;
统计每个学生的总成绩:
select name,sum(cj) as totalscore
from arr1
lateral view explode(score) score as cj
group by name;
1.1.2 行转列
就是把多行数据转化成一行数据:
#原始数据:
zhangsan 90
zhangsan 87
zhangsan 63
zhangsan 76
#转化后数据
zhangsan 90,87,63,76
create table arr_temp
as
select name,cj
from arr1
lateral view explode(score) score as cj;
它们都是将分组中的某列转为一个数组返回
create table if not exists arr3(
name string,
score array<string>
)
row format delimited fields terminated by ' '
collection items terminated by ',';
它们都是将分组中的某列转为一个数组返回
create table if not exists arr3(
name string,
score array<string>
)
row format delimited fields terminated by ' '
collection items terminated by ',';
insert into arr3
select name,collect_set(cj)
from arr_temp
group by name;
有数据如下:
zhangsan chinese:90,math:87,english:63,nature:76
lisi chinese:60,math:30,english:78,nature:0
wangwu chinese:89,math:25,english:81,nature:9
创建map类型的表
create table if not exists map1(
name string, score map<string,int>
)
row format delimited fields terminated by ' '
collection items terminated by ','
map keys terminated by ':';
加载数据
load data local inpath '/root/hivetest/map1.txt' into table map1;
Map格式数据查询
#查询数学大于35分的学生的英语和自然成绩:
select
m.name,
m.score['english'] ,
m.score['nature']
from map1 m
where m.score['math'] > 35;
1.2.1Map列转行
使用上面的数据
explode 展开数据
select explode(score) as (m_class,m_score) from map1;
Lateral view
Lateral View和split,explode等一起使用,它能够将一行数据拆成多行数据,并在此基础上对拆分后的数据进行聚合。
select name,m_class,m_score
from map1
lateral view explode(score) score as m_class,m_score;
create table if not exists str2(
uname string,
addr struct < province:string,city:string,xian:string,dadao:string >
)
row format delimited fields terminated by '\t'
collection items terminated by ',';
#导入数据:
load data local inpath '/root/hivetest/struct.txt' into table str2;
#查询数据:
select uname,addr.province,addr.city,addr.xian from str2;
1.4.1数据准备
uid uname belong tax addr
1 xdd ll,lw,lg,lm wuxian:300,gongjijin:1200,shebao:300 北京,西城区,中南海
2 lkq lg,lm,lw,ll,mm wuxian:200,gongjijin:1000,shebao:200 河北,石家庄,中山路```
#查询:下属个数大于4个,公积金小于1200,省份在河北的数据 create table if not exists tax( id int, name string, belong array<string>, tax map<string,double>, addr struct<province:string,city:string,road:string> ) row format delimited fields terminated by ' ' collection items terminated by ',' map keys terminated by ':' stored as textfile; # 导入数据 load data local inpath '/root/hivetest/tax.txt' into table tax; #查询:下属个数大于4个,公积金小于1200,省份在河北的数据 select id, name, belong[0], belong[1], tax['wuxian'], tax['shebao'], addr.road from tax where size(belong) > 4 and tax['gongjijin'] < 1200 and addr.province = '河北'; 结果: 2 lkq lg lw 2000.0 300.0 河北 石家庄
可以用下面两个命令查看Hive中的函数
-- 显示Hive中所有函数
show functions;
-- 查看某个函数的用法
desc function array;
因为Hive的核心功能和海量数据统计分析,而在统计分析时日期时间是一个非常重要的维度,所以日期函数在Hive使用中尤为重要.
-- 时间戳转日期 select from_unixtime(1505456567); select from_unixtime(1505456567,'yyyyMMdd'); select from_unixtime(1505456567,'yyyy-MM-dd HH:mm:ss'); -- 获取当前时间戳 select unix_timestamp(); -- 日期转时间戳 select unix_timestamp('2017-09-15 14:23:00'); -- 计算时间差 select datediff('2018-06-18','2018-11-21'); -- 查询当月第几天 select dayofmonth(current_date); -- 月末: select last_day(current_date); --当月第1天: select date_sub(current_date,dayofmonth(current_date)-1); --下个月第1天: select add_months(date_sub(current_date,dayofmonth(current_date)-1),1); -- 当前日期 select current_date -- 字符串转时间(字符串必须为:yyyy-MM-dd格式) select to_date('2017-01-01 12:12:12'); -- 日期、时间戳、字符串类型格式化输出标准时间格式: select date_format(current_timestamp(),'yyyy-MM-dd HH:mm:ss'); select date_format(current_date(),'yyyyMMdd'); select date_format('2017-01-01','yyyy-MM-dd HH:mm:ss');
lower--(转小写) select lower('ABC'); upper--(转大写) select lower('abc'); length--(字符串长度,字符数) select length('abc'); concat--(字符串拼接) select concat("A", 'B'); concat_ws --(指定分隔符) select concat_ws('-','a' ,'b','c'); substr--(求子串) select substr('abcde',3);
cast(value as type) -- 类型转换
select cast('123' as int)+1;
round --四舍五入((42.3 =>42))
select round(42.3);
ceil --向上取整(42.3 =>43)
select ceil(42.3);
floor --向下取整(42.3 =>42)
select floor(42.3);
nvl(expr1,expr2)
#作用:将查询为Null值转换为指定值。
#若expr1为Null,则返回expr2,否则返回expr1。
select nvl(count,2);
窗口函数又名开窗函数,属于分析函数的一种。用于解决复杂报表统计需求的功能强大的函数。窗口函数用于计算基于组的某种聚合值,它和聚合函数的不同之处是:对于每个组返回多行,而聚合函数对于每个组只返回一行。 开窗函数指定了分析函数工作的数据窗口大小,这个数据窗口大小可能会随着行的变化而变化。
3.2.1 over开窗
使用窗口函数之前一般要通过over()进行开窗,简单可以写成函数+over简单的写法如下:
-- 1.不使用窗口函数
-- 查询所有明细
select * from t_order;
# 查询总量
select count(*) from t_order;
-- 2.使用窗口函数
select *, count(*) over() from t_order;
注意:
需求:查询在2018年1月份购买过的顾客购买明细及总人数
select *,count(*) over ()
from t_order
where substring(orderdate,1,7) = '2018-01'
3.2.2partition by子句
在over窗口中进行分区,对某一列进行分区统计,窗口的大小就是分区的大小
需求:查看顾客的购买明细及月购买总额
select name, orderdate, cost, sum(cost) over (
partition by month(orderdate))
from t_order;
3.2.3 order by
order by子句会让输入的数据强制排序
select name, orderdate, cost, sum(cost) over (
partition by month(orderdate)
order by orderdate)
from t_order;
3.2.4Window子句
如果要对窗口的结果做更细粒度的划分,那么就使用window子句,常见的有下面几个:
需求:查看顾客到目前为止的购买总额
select name,
t_order.orderdate,
cost,
sum(cost)
over (
partition by name
order by orderdate
rows between UNBOUNDED PRECEDING and current row
) as allCount
from t_order;
3.3.1 ntile
用于将分组数据按照顺序切分成n片,返回当前切片值
例子
select name,
orderdate,
cost,
ntile(3) over(partition by name)
# 按照name进行分组,在分组内将数据切成3 份
from t_order;
3.3.2 lag和lead函数
需求:查询顾客上次购买的时间
select name,
orderdate,
cost,
lag(orderdate,1,'1990-01-01') over(partition by name order by orderdate ) as time1
from t_order;
取得顾客下次购买的时间
select name,
orderdate,
cost,
lead(orderdate,1) over(partition by name order by orderdate ) as time1
from t_order;
3.3.3first_value和last_value
select name,
orderdate,
cost,
first_value(orderdate) over(partition by name order by orderdate) as time1,
last_value(orderdate) over(partition by name order by orderdate) as time2
from t_order
row_number():没有并列,相同名次依顺序排
rank():有并列,相同名次空位
dense_rank():有并列,相同名次不空位
准备数据多次的考试成绩
create table if not exists stu_score(
dt string,
name string,
score int )
row format delimited fields terminated by ',';
load data local inpath '/opt/data/stu_score.txt' overwrite into table stu_score;
需求1:对每次考试按照考试成绩倒序
select *
from stu_score
order by dt,score desc ;
需求2:获取每次考试的排名情况
select dt,name,score,
-- 没有并列,相同名次依顺序排
row_number() over(distribute by dt sort by score desc) rn,
-- rank():
有并列,相同名次空位
rank() over(distribute by dt sort by score desc) rn,
-- dense_rank():有并列,相同名次不空位
dense_rank() over(distribute by dt sort by score desc) rn
from stu_score;
需求3:求每次考试的前三名
select *
from (
select dt,
name,
score,
row_number() over (distribute by dt sort by score desc) rn
from stu_score ) a
where a.rn < 4;
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。