CREATE TABLE
语法
①
CREATE TABLE [ IF NOT EXISTS ]
[catalog_name.][db_name.]table_name (
{ column_name data_type [ NOT NULL ]
[ COMMENT col_comment]
[ WITH ( property_name = expression [, ...] ) ]
| LIKE existing_table_name
[ { INCLUDING | EXCLUDING } PROPERTIES ]
}
[, ...]
)
[ COMMENT table_comment ]
[ WITH ( property_name = expression [, ...] ) ]
②
CREATE [EXTERNAL] TABLE [IF NOT EXISTS]
[catalog_name.][db_name.]table_name (
{ column_name data_type [ NOT NULL ]
[ COMMENT comment ]
[ WITH ( property_name = expression [, ...] ) ]
| LIKE existing_table_name
[ { INCLUDING | EXCLUDING } PROPERTIES ]
}
[, ...]
)
[COMMENT 'table_comment']
[PARTITIONED BY(col_name data_type, ....)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name, col_name, ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION 'hdfs_path']
[TBLPROPERTIES (orc_table_property = value [, ...] ) ]
③
CREATE [EXTERNAL] TABLE [IF NOT EXISTS]
[catalog_name.][db_name.]table_name (
{ column_name data_type [ NOT NULL ]
[ COMMENT comment ]
[ WITH ( property_name = expression [, ...] ) ]
| LIKE existing_table_name
[ { INCLUDING | EXCLUDING } PROPERTIES ]
}
[, ...]
)
[PARTITIONED BY(col_name data_type, ....)]
[SORT BY ([column [, column ...]])]
[COMMENT 'table_comment']
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION 'hdfs_path']
[TBLPROPERTIES (orc_table_property = value [, ...] ) ]
限制
- session属性可以设置bucket_count,默认值为-1,表示未设置。创建分区表时,如果bucket_count为-1且建表语句中未设置buckets,则使用默认值16。
- 默认外部表存储位置/user/hive/warehouse/{schema_name}/{table_name},其中{schema_name}为建表时使用的schema,{table_name}为表名。
- 指定属性“transactional=true”可以让表支持“原子性、一致性、隔离性、持久性”写入的事务能力,但是将表定义为事务表后,无法通过设置“transactional=false”将其退化为非事务表。
transactional='true'或 '0'在执行过程中不会进行类型转换,所以这种写法会抛出异常:
Cannot convert ['true'] to boolean
Cannot convert ['0'] to boolean
- 默认不允许向托管表(表属性external = true)插入数据,如需使用该功能,可参考注意事项,添加hive自定义属性:hive.non-managed-table-writes-enabled=true。
- Mppdb有一个限制,数据库的标识符的最大长度为63,如果把标识符命名超过了最大长度,那么会被自动截取掉超出的部分,只留下最大长度的标识符。
- 跨域场景不支持建表。
描述
使用CREATE TABLE创建一个具有指定列的、新的空表。使用CREATE TABLE AS创建带数据的表。
- 使用可选参数IF NOT EXISTS,如果表已经存在则不会报错。
- WITH子句可用于在新创建的表或单列上设置属性,如表的存储位置(location)、是不是外表(external)等。
- LIKE子句用于在新表中包含来自现有表的所有列定义。可以指定多个LIKE子句,从而允许从多个表中复制列。如果指定了INCLUDING PROPERTIES,则将所有表属性复制到新表中。如果WITH子句指定的属性名称与复制的属性名称相同,则将使用WITH子句中的值。默认是EXCLUDING PROPERTIES属性,而且最多只能为一个表指定INCLUDING PROPERTIES属性。
- PARTITIONED BY能够用于指定分区的列;CLUSTERED BY能够被用于指定分桶的列;SORT BY和SORTED BY能够用于给指定的分桶列进行排序;BUCKETS能够被用于指定分桶数;EXTERNAL可用于指定创建外部表;STORED AS能被用于指定文件存储的格式;LOCATION能被用于指定在HDFS上存储的路径。
想要查看支持哪些column属性,可以运行以下命令,会显示当前对接的catalog分别支持哪些列属性。
SELECT * FROM system.metadata.column_properties;
想要查看支持哪些table属性,可以运行以下命令:
SELECT * FROM system.metadata.table_properties;
下表为catalog为hive时的查询结果。
SELECT * FROM system.metadata.table_properties where catalog_name = 'hive';
catalog_name |
property_name |
default_value |
type |
description |
---|---|---|---|---|
hive |
auto_purge |
false |
boolean |
Skip trash when table or partition is deleted |
hive |
avro_schema_url |
- |
varchar |
URI pointing to Avro schema for the table |
hive |
bucket_count |
0 |
integer |
Number of buckets |
hive |
bucketed_by |
[] |
array(varchar) |
Bucketing columns |
hive |
bucketing_version |
- |
integer |
Bucketing version |
hive |
csv_escape |
- |
varchar |
CSV escape character |
hive |
csv_quote |
- |
varchar |
CSV quote character |
hive |
csv_separator |
- |
varchar |
CSV separator character |
hive |
external_location |
- |
varchar |
File system location URI for external table |
hive |
format |
ORC |
varchar |
Hive storage format for the table. Possible values: [ORC, PARQUET, AVRO, RCBINARY, RCTEXT, SEQUENCEFILE, JSON, TEXTFILE, TEXTFILE_MULTIDELIM, CSV] |
hive |
orc_compress |
GZIP |
varchar |
Compression codec used. Possible values: [NONE, SNAPPY, LZ4, ZSTD, GZIP, ZLIB] |
hive |
orc_compress_size |
262144 |
bigint |
orc compression size |
hive |
orc_row_index_stride |
10000 |
integer |
no. of row index strides |
hive |
orc_stripe_size |
67108864 |
bigint |
orc stripe size |
hive |
orc_bloom_filter_columns |
[] |
array(varchar) |
ORC Bloom filter index columns |
hive |
orc_bloom_filter_fpp |
0.05 |
double |
ORC Bloom filter false positive probability |
hive |
partitioned_by |
[] |
array(varchar) |
Partition columns |
hive |
sorted_by |
[] |
array(varchar) |
Bucket sorting columns |
hive |
textfile_skip_footer_line_count |
- |
integer |
Number of footer lines |
hive |
textfile_skip_header_line_count |
- |
integer |
Number of header lines |
hive |
transactional |
false |
boolean |
Is transactional property enabled |
示例
- 创建一个新表orders,使用子句with指定创建表的存储格式、存储位置、以及是否为外表。
通过“auto.purge”参数可以指定涉及到数据移除操作(如DROP、DELETE、INSERT OVERWRITE、TRUNCATE TABLE)时是否清除相关数据:
- "auto.purge"='true'时,清除元数据和数据文件。
- "auto.purge"='false'时,仅清除元数据,数据文件会移入HDFS回收站。默认值为“false”,且不建议用户修改此属性,避免数据删除后无法恢复。
CREATE TABLE orders ( orderkey bigint, orderstatus varchar, totalprice double, orderdate date ) WITH (format = 'ORC', location='/user',orc_compress='ZLIB',external=true, "auto.purge"=false); -- 通过DESC FORMATTED 语句,可以查看建表的详细信息 desc formatted orders ; Describe Formatted Table ------------------------------------------------------------------------------ # col_name data_type comment orderkey bigint orderstatus varchar totalprice double orderdate date # Detailed Table Information Database: default Owner: admintest LastAccessTime: 0 Location: hdfs://hacluster/user Table Type: EXTERNAL_TABLE # Table Parameters: EXTERNAL TRUE auto.purge false orc.compress.size 262144 orc.compression.codec ZLIB orc.row.index.stride 10000 orc.stripe.size 67108864 presto_query_id 20220812_084110_00050_srknk@default@HetuEngine presto_version 1.2.0-h0.cbu.mrs.320.r1-SNAPSHOT transient_lastDdlTime 1660293670 # Storage Information SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat Compressed: No Num Buckets: -1 Bucket Columns: [] Sort Columns: [] Storage Desc Params: serialization.format 1 (1 row)
- 创建一个新表,指定Row format:
--建表时,指定表的字段分隔符为‘,’号(如果创建外表,要求数据文件中的每条记录的字段是以逗号进行分隔) CREATE TABLE student( id string,birthday string, grade int, memo string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; --建表时,指定字段分隔符为'\t',换行符为'\n' CREATE TABLE test( id int, name string , tel string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE;
- 如果表orders不存在,则创建表orders,并且增加表注释和列注释:
CREATE TABLE IF NOT EXISTS orders ( orderkey bigint, orderstatus varchar, totalprice double COMMENT 'Price in cents.', orderdate date ) COMMENT 'A table to keep track of orders.'; insert into orders values (202011181113,'online',9527,date '2020-11-11'), (202011181114,'online',666,date '2020-11-11'), (202011181115,'online',443,date '2020-11-11'), (202011181115,'offline',2896,date '2020-11-11');
- 使用表orders的列定义创建表bigger_orders:
CREATE TABLE bigger_orders ( another_orderkey bigint, LIKE orders, another_orderdate date ); SHOW CREATE TABLE bigger_orders ; Create Table --------------------------------------------------------------------- CREATE TABLE hive.default.bigger_orders ( another_orderkey bigint, orderkey bigint, orderstatus varchar, totalprice double, ordersdate date, another_orderdate date ) WITH ( external = false, format = 'ORC', location = 'hdfs://hacluster/user/hive/warehouse/bigger_orders', orc_compress = 'GZIP', orc_compress_size = 262144, orc_row_index_stride = 10000, orc_stripe_size = 67108864 ) (1 row)
- 标号① 建表示例:
CREATE EXTERNAL TABLE hetu_test (orderkey bigint, orderstatus varchar, totalprice double, orderdate date) PARTITIONED BY(ds int) SORT BY (orderkey, orderstatus) COMMENT 'test' STORED AS ORC LOCATION '/user' TBLPROPERTIES (orc_compress = 'SNAPPY', orc_compress_size = 6710422, orc_bloom_filter_columns = 'orderstatus,totalprice');
- 标号② 建表示例:
CREATE EXTERNAL TABLE hetu_test1 (orderkey bigint, orderstatus varchar, totalprice double, orderdate date) COMMENT 'test' PARTITIONED BY(ds int) CLUSTERED BY (orderkey, orderstatus) SORTED BY (orderkey, orderstatus) INTO 16 BUCKETS STORED AS ORC LOCATION '/user' TBLPROPERTIES (orc_compress = 'SNAPPY', orc_compress_size = 6710422, orc_bloom_filter_columns = 'orderstatus,totalprice');
- 标号③ 建表示例:
CREATE TABLE hetu_test2 (orderkey bigint, orderstatus varchar, totalprice double, orderdate date, ds int) COMMENT 'This table is in Hetu syntax' WITH (partitioned_by = ARRAY['ds'], bucketed_by = ARRAY['orderkey', 'orderstatus'], sorted_by = ARRAY['orderkey', 'orderstatus'], bucket_count = 16, orc_compress = 'SNAPPY', orc_compress_size = 6710422, orc_bloom_filter_columns = ARRAY['orderstatus', 'totalprice'], external = true, format = 'orc', location = '/user');
- 查看表的建表语句:
show create table hetu_test1; Create Table ------------------------------------------------------------------ CREATE TABLE hive.default.hetu_test1 ( orderkey bigint, orderstatus varchar, totalprice double, orderdate date, ds integer ) COMMENT 'test' WITH ( bucket_count = 16, bucketed_by = ARRAY['orderkey','orderstatus'], bucketing_version = 1, external_location = 'hdfs://hacluster/user', format = 'ORC', orc_bloom_filter_columns = ARRAY['orderstatus','totalprice'], orc_bloom_filter_fpp = 5E-2, orc_compress = 'SNAPPY', orc_compress_size = 6710422, orc_row_index_stride = 10000, orc_stripe_size = 67108864, partitioned_by = ARRAY['ds'], sorted_by = ARRAY['orderkey','orderstatus'] ) (1 row)
创建分区表
--创建schema CREATE SCHEMA hive.web WITH (location = 'hdfs://hacluster/user'); --创建分区表 CREATE TABLE hive.web.page_views ( view_time timestamp, user_id bigint, page_url varchar, ds date, country varchar ) WITH ( format = 'ORC', partitioned_by = ARRAY['ds', 'country'], bucketed_by = ARRAY['user_id'], bucket_count = 50 ); --插入空的分区 CALL system.create_empty_partition( schema_name => 'web', table_name => 'page_views', partition_columns => ARRAY['ds', 'country'], partition_values => ARRAY['2020-07-17', 'US']); CALL system.create_empty_partition( schema_name => 'web', table_name => 'page_views', partition_columns => ARRAY['ds', 'country'], partition_values => ARRAY['2020-07-18', 'US']); --查看分区 SELECT * FROM hive.web."page_views$partitions"; ds | country ------------|--------- 2020-07-18 | US 2020-07-17 | US --插入数据 insert into hive.web.page_views values(timestamp '2020-07-17 23:00:15',bigint '15141','www.local.com',date '2020-07-17','US' ); insert into hive.web.page_views values(timestamp '2020-07-18 23:00:15',bigint '18148','www.local.com',date '2020-07-18','US' ); --查询数据 select * from hive.web.page_views; view_time | user_id | page_url | ds | country -------------------------|---------|---------------|------------|--------- 2020-07-17 23:00:15.000 | 15141 | www.local.com | 2020-07-17 | US 2020-07-18 23:00:15.000 | 18148 | www.local.com | 2020-07-18 | US