更新时间:2024-12-31 GMT+08:00

GROUP BY

GROUP BY

GROUP BY将SELECT语句的输出行划分成包含匹配值的分组。简单的GROUP BY可以包含由输入列组成的任何表达式,也可以是按位置选择输出列的序号。

以下查询是等效的:

SELECT count(*), nationkey FROM customer GROUP BY 2;
SELECT count(*), nationkey FROM customer GROUP BY nationkey;

GROUP BY可以按未出现在SELECT语句输出中的输入列名对输出进行分组。

例如:

SELECT count(*) FROM customer GROUP BY mktsegment;
GROUPING SETS

可以指定多个列进行分组,结果列中不属于分组列的将被设置为NUll。具有复杂分组语法(GROUPING SETS、CUBE或ROLLUP)的查询只从基础数据源读取一次,而使用UNION ALL的查询将读取基础数据三次。这就是当数据源不具有确定性时,使用UNION ALL的查询可能会产生不一致的结果的原因。

--创建一个航运表
create table shipping(origin_state varchar(25),origin_zip integer,destination_state varchar(25) ,destination_zip integer,package_weight integer);

--插入数据
insert into shipping values ('California',94131,'New Jersey',8648,13),
('California',94131,'New Jersey',8540,42),
('California',90210,'Connecticut',6927,1337),
('California',94131,'Colorado',80302,5),
('New York',10002,'New Jersey',8540,3),
('New Jersey',7081,'Connecticut',6708,225);

--执行查询Grouping sets
SELECT
	origin_state,
	origin_zip,
	destination_state,
	sum( package_weight ) 
FROM shipping 
GROUP BY GROUPING SETS (
		( origin_state ),
	( origin_state, origin_zip ),
	( destination_state ));
--这个的查询在逻辑上等同于多个分组查询的union all
SELECT origin_state, NULL,NULL,sum( package_weight ) FROM shipping GROUP BY origin_state UNION ALL  SELECT origin_state,origin_zip,NULL,sum( package_weight ) FROM shipping GROUP BY origin_state,origin_zip UNION ALL  SELECT NULL,NULL,destination_state,sum( package_weight ) FROM  shipping GROUP BY  destination_state;
--结果
 origin_state | origin_zip | destination_state | _col3 
--------------|------------|-------------------|-------
 New Jersey   |       NULL | NULL              |   225 
 California   |      94131 | NULL              |    60 
 California   |       NULL | NULL              |  1397 
 New York     |      10002 | NULL              |     3 
 NULL         |       NULL | New Jersey        |    58 
 NULL         |       NULL | Connecticut       |  1562 
 California   |      90210 | NULL              |  1337 
 New York     |       NULL | NULL              |     3 
 NULL         |       NULL | Colorado          |     5 
 New Jersey   |       7081 | NULL              |   225 
(10 rows)

CUBE

为给定的列生成所有可能的分组,比如 (origin_state, destination_state) 的可能分组为:(origin_state, destination_state),(origin_state),(destination_state),()。

SELECT
	origin_state,
	destination_state,
	sum( package_weight ) 
FROM
	shipping 
GROUP BY
	CUBE ( origin_state, destination_state );
--等同于 
SELECT
origin_state,
destination_state,
sum( package_weight ) 
FROM
	shipping 
GROUP BY
	GROUPING SETS (
		( origin_state, destination_state ),
		( origin_state ),
	( destination_state ),
	());

ROLLUP

为给定的列集生成部分可能的分类汇总:

SELECT
	origin_state,
	origin_zip,
	sum( package_weight ) 
FROM
	shipping 
GROUP BY
	ROLLUP ( origin_state, origin_zip );
--等同于
SELECT
origin_state,
origin_zip,
sum( package_weight ) 
FROM
	shipping 
GROUP BY
	GROUPING SETS ((origin_state,origin_zip ),( origin_state ),());

Group by 子句目前不支持使用列的别名,例如:

select count(userid) as num ,dept as aaa from salary group by aaa having sum(sal)>2000;

报错如下:

Query 20210630_084610_00018_wc8n9@default@HetuEngine failed: line 1:63: Column 'aaa' cannot be resolved