当前位置: article > 正文

Hive进阶(1)重点_row farmat delimite fields terminated

作者：运维做开发 | 2024-07-13 20:50:38

踩

row farmat delimite fields terminated

一、Hive的复杂数据类型

前面已经讲过基本数据类型，以下是复杂基本类型
在这里插入图片描述
复杂类型分为三种,分别是数组array,键值对map,和结构体struct

array : col array<基本类型> ,下标从0开始，越界不报错，以NULL代替 
map : column map<string,string> 
struct: col struct 123
1
2
3

1.1array示例

-- 数据如下: 注意下面列之间是通过TAB来分隔的 
zhangsan 78,89,92,96 
lisi 67,75,83,94 

# 注意terminated顺序,新建数组类型 
create table if not exists arr1 ( 
name string, 
score array<string>
) 
row format delimited fields terminated by '\t' 
collection items terminated by ',';
#collection items terminated by ','是元素分隔符，这里的意思是，用','来进行array的切割

# 导入数据 
load data local inpath '/root/hivetest/arr1.csv' into table arr1;

# 查询： 
select * from arr1; 
select name,score[1] from arr1 where size(score) > 3;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

1.1.1列转行

就是把一列数据转化成多行,如下:

#原始数据： 
zhangsan 90,87,63,76 

#转化后数据 
zhangsan 90 
zhangsan 87 
zhangsan 63 
zhangsan 76
1
2
3
4
5
6
7
8

内嵌插叙：

explode：展开

列表中的每个元素生成一行

select explode(score) score from arr1;
1

lateral view:虚拟表

侧视图的意义是配合explode，一个语句生成把单行数据拆解成多行后的数据结果集。

解释:lateral view 会将explode生成的结果放到一个虚拟表中，然后这个虚拟表会和当前表
join,来达到数据聚合的目的。

结构解析:要进行聚合的虚拟表，lateral view explode(字段) 虚拟表名 as 虚拟表字段名

select name,cj from arr1 lateral view explode(score) score as cj;
1

统计每个学生的总成绩：

select name,sum(cj) as totalscore 
from arr1 
lateral view explode(score) score as cj 
group by name;
1
2
3
4

1.1.2 行转列

就是把多行数据转化成一行数据:

#原始数据： 
zhangsan 90 
zhangsan 87 
zhangsan 63 
zhangsan 76 

#转化后数据 
zhangsan 90,87,63,76
1
2
3
4
5
6
7
8

准备数据

create table arr_temp 
as
select name,cj 
from arr1 
lateral view explode(score) score as cj;
1
2
3
4
5

collect_set函数：

它们都是将分组中的某列转为一个数组返回

create table if not exists arr3( 
name string, 
score array<string> 
)
row format delimited fields terminated by ' ' 
collection items terminated by ',';
1
2
3
4
5
6

将数据写成array格式：

它们都是将分组中的某列转为一个数组返回

create table if not exists arr3( 
name string, 
score array<string> 
)
row format delimited fields terminated by ' ' 
collection items terminated by ',';
1
2
3
4
5
6

将数据写成array格式：

insert into arr3 
select name,collect_set(cj) 
from arr_temp 
group by name;
1
2
3
4

1.2 map示例

有数据如下:

zhangsan chinese:90,math:87,english:63,nature:76 
lisi chinese:60,math:30,english:78,nature:0 
wangwu chinese:89,math:25,english:81,nature:9
1
2
3

创建map类型的表

create table if not exists map1( 
name string, score map<string,int> 
)
row format delimited fields terminated by ' ' 
collection items terminated by ',' 
map keys terminated by ':';
1
2
3
4
5
6

加载数据

load data local inpath '/root/hivetest/map1.txt' into table map1;
1

Map格式数据查询

#查询数学大于35分的学生的英语和自然成绩： 
select 
m.name, 
m.score['english'] , 
m.score['nature'] 
from map1 m 
where m.score['math'] > 35;
1
2
3
4
5
6
7

1.2.1Map列转行

使用上面的数据

explode 展开数据

select explode(score) as (m_class,m_score) from map1;
1

Lateral view

Lateral View和split，explode等一起使用，它能够将一行数据拆成多行数据，并在此基础上对拆分后的数据进行聚合。

select name,m_class,m_score 
from map1 
lateral view explode(score) score as m_class,m_score;
1
2
3

1.3 struct示例

create table if not exists str2( 
uname string, 
addr struct < province:string,city:string,xian:string,dadao:string >
) 
row format delimited fields terminated by '\t' 
collection items terminated by ','; 

#导入数据： 
load data local inpath '/root/hivetest/struct.txt' into table str2; 

#查询数据： 
select uname,addr.province,addr.city,addr.xian from str2;
1
2
3
4
5
6
7
8
9
10
11
12

1.4复杂数据类型案例

1.4.1数据准备

uid uname     belong                    tax                                     addr 
1 xdd ll,lw,lg,lm wuxian:300,gongjijin:1200,shebao:300 北京,西城区,中南海 
2 lkq lg,lm,lw,ll,mm wuxian:200,gongjijin:1000,shebao:200 河北,石家庄,中山路```
1
2
3

#查询：下属个数大于4个，公积金小于1200，省份在河北的数据

create table if not exists tax(
id int, 
name string, 
belong array<string>, 
tax map<string,double>, 
addr struct<province:string,city:string,road:string> 
)
row format delimited fields terminated by ' ' 
collection items terminated by ',' 
map keys terminated by ':' stored as textfile; 

# 导入数据 
load data local inpath '/root/hivetest/tax.txt' into table tax; 

#查询：下属个数大于4个，公积金小于1200，省份在河北的数据 
select 
id, 
name, 
belong[0], 
belong[1], 
tax['wuxian'], 
tax['shebao'], 
addr.road 
from tax 
where size(belong) > 4 
and tax['gongjijin'] < 1200 
and addr.province = '河北'; 

结果: 2 lkq lg lw 2000.0 300.0 河北 石家庄
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

二、系统内置函数

2.1 函数查看

可以用下面两个命令查看Hive中的函数

-- 显示Hive中所有函数 
show functions; 

-- 查看某个函数的用法 
desc function array;
1
2
3
4
5

2.2 日期查看

因为Hive的核心功能和海量数据统计分析,而在统计分析时日期时间是一个非常重要的维度,所以日期函数在Hive使用中尤为重要.

-- 时间戳转日期 
select from_unixtime(1505456567); 
select from_unixtime(1505456567,'yyyyMMdd'); 
select from_unixtime(1505456567,'yyyy-MM-dd HH:mm:ss'); 

-- 获取当前时间戳 
select unix_timestamp();

-- 日期转时间戳 
select unix_timestamp('2017-09-15 14:23:00'); 

-- 计算时间差 
select datediff('2018-06-18','2018-11-21'); 

-- 查询当月第几天 
select dayofmonth(current_date); 

-- 月末: 
select last_day(current_date); 

--当月第1天: 
select date_sub(current_date,dayofmonth(current_date)-1); 

--下个月第1天: 
select add_months(date_sub(current_date,dayofmonth(current_date)-1),1); 

-- 当前日期 
select current_date 

-- 字符串转时间（字符串必须为：yyyy-MM-dd格式） 
select to_date('2017-01-01 12:12:12'); 

-- 日期、时间戳、字符串类型格式化输出标准时间格式： 
select date_format(current_timestamp(),'yyyy-MM-dd HH:mm:ss'); 
select date_format(current_date(),'yyyyMMdd'); 
select date_format('2017-01-01','yyyy-MM-dd HH:mm:ss');
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

2.3 字符串函数

lower--（转小写） 
select lower('ABC'); 

upper--（转大写） 
select lower('abc'); 

length--（字符串长度，字符数） 
select length('abc');

concat--（字符串拼接） 
select concat("A", 'B'); 

concat_ws --（指定分隔符） 
select concat_ws('-','a' ,'b','c'); 

substr--（求子串） 
select substr('abcde',3);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

2.4 类型转换函数

cast(value as type) -- 类型转换 
select cast('123' as int)+1;
1
2

2.5 数学函数

round --四舍五入((42.3 =>42)) 
select round(42.3); 

ceil --向上取整(42.3 =>43) 
select ceil(42.3); 

floor --向下取整(42.3 =>42) 
select floor(42.3);
1
2
3
4
5
6
7
8

2.6 判读那为空函数

nvl（expr1，expr2） 
#作用：将查询为Null值转换为指定值。 
#若expr1为Null，则返回expr2，否则返回expr1。 
select nvl(count,2);
1
2
3
4

三、窗口函数

3.1简介

窗口函数又名开窗函数，属于分析函数的一种。用于解决复杂报表统计需求的功能强大的函数。窗口函数用于计算基于组的某种聚合值，它和聚合函数的不同之处是：对于每个组返回多行，而聚合函数对于每个组只返回一行。开窗函数指定了分析函数工作的数据窗口大小，这个数据窗口大小可能会随着行的变化而变化。
在这里插入图片描述

默认的数据库的查询都是要么详细记录,要么聚合分析,如果要查询详细记录和聚合数据,必须要经过两次查询
简单的说窗口函数对每条详细记录开一个窗口,进行聚合统计的查询
默认mysql老版本没有支持,在最新的8.0版本中支持, Oracle和Hive中都支持窗口函数

3.2语法

3.2.1 over开窗
使用窗口函数之前一般要通过over()进行开窗,简单可以写成函数+over简单的写法如下:

-- 1.不使用窗口函数 
-- 查询所有明细 
select * from t_order; 
# 查询总量 
select count(*) from t_order;

 -- 2.使用窗口函数 
 select *, count(*) over() from t_order;
1
2
3
4
5
6
7
8

注意：

窗口函数是针对每一行数据的
如果over中没有参数，默认的是全部结果集

需求：查询在2018年1月份购买过的顾客购买明细及总人数

select *,count(*) over () 
from t_order 
where substring(orderdate,1,7) = '2018-01'
1
2
3

3.2.2partition by子句

在over窗口中进行分区，对某一列进行分区统计，窗口的大小就是分区的大小
需求：查看顾客的购买明细及月购买总额

select name, orderdate, cost, sum(cost) over (
partition by month(orderdate)) 
from t_order;
1
2
3

3.2.3 order by

order by子句会让输入的数据强制排序

select name, orderdate, cost, sum(cost) over (
partition by month(orderdate) 
order by orderdate) 
from t_order;
1
2
3
4

3.2.4Window子句

如果要对窗口的结果做更细粒度的划分，那么就使用window子句，常见的有下面几个：

preceding : 往前
following : 往后
current row : 当前行
unbounded : 起点，unbounded preceding 表示从前面的起点，unbounded following : 表示到后面的终点。

一般window子句都是rows开头

需求：查看顾客到目前为止的购买总额

select name, 
t_order.orderdate, 
cost, 
sum(cost) 
over (
partition by name 
order by orderdate 
rows between UNBOUNDED PRECEDING and current row 
) as allCount 
from t_order;
1
2
3
4
5
6
7
8
9
10

3.3 序列函数

3.3.1 ntile

用于将分组数据按照顺序切分成n片，返回当前切片值

例子

select name,
orderdate,
cost, 
ntile(3) over(partition by name) 
# 按照name进行分组,在分组内将数据切成3 份
from t_order;
1
2
3
4
5
6

3.3.2 lag和lead函数

lag返回当前数据行的上一行数据
lead放回当前数据行的下一行数据

需求：查询顾客上次购买的时间

select name,
orderdate,
cost, 
lag(orderdate,1,'1990-01-01') over(partition by name order by orderdate ) as time1 
from t_order;
1
2
3
4
5

取得顾客下次购买的时间

select name,
orderdate,
cost, 
lead(orderdate,1) over(partition by name order by orderdate ) as time1 
from t_order;
1
2
3
4
5

3.3.3first_value和last_value

first_value取分组内排序后，截至到当前行，第一个值
last_value 分组内排序后，截至到当前行，最后一个值

select name,
orderdate,
cost, 
first_value(orderdate) over(partition by name order by orderdate) as time1,
last_value(orderdate) over(partition by name order by orderdate) as time2 
from t_order
1
2
3
4
5
6

3.4排名函数

row_number从1开始，按照顺序，生成分组内记录的序列，row_number()的值不会存在重复，当排序的值相同时，按照表中记录的顺序进行排列
rank()生成数据项在分组中的排名，排名相等会在名词中留下空位
dense_rank()生成数据项在分组中的排名，排名相等会在名词中不会留下空位

row_number()：没有并列，相同名次依顺序排 
rank()：有并列，相同名次空位 
dense_rank()：有并列，相同名次不空位
1
2
3

准备数据多次的考试成绩

create table if not exists stu_score( 
dt string,
name string, 
score int )
row format delimited fields terminated by ','; 

load data local inpath '/opt/data/stu_score.txt' overwrite into table stu_score;
1
2
3
4
5
6
7

需求1：对每次考试按照考试成绩倒序

select * 
from stu_score 
order by dt,score desc ;
1
2
3

需求2：获取每次考试的排名情况

select dt,name,score, 
-- 没有并列，相同名次依顺序排 
row_number() over(distribute by dt sort by score desc) rn, 
-- rank()：
有并列，相同名次空位 
rank() over(distribute by dt sort by score desc) rn, 
-- dense_rank()：有并列，相同名次不空位 
dense_rank() over(distribute by dt sort by score desc) rn 
from stu_score;
1
2
3
4
5
6
7
8
9

需求3：求每次考试的前三名

select * 
from ( 
	select dt, 
		name, 
		score, 
		row_number() over (distribute by dt sort by score desc) rn 
	from stu_score ) a 
where a.rn < 4;
1
2
3
4
5
6
7
8

四、自定义函数

https://blog.csdn.net/xhzxhz12/article/details/116244713

声明：本文内容由网友自发贡献，转载请注明出处：【wpsshop】