更新时间:2024-12-11 GMT+08:00

CREATE TABLE

语法

CREATE TABLE [ IF NOT EXISTS ]

[catalog_name.][db_name.]table_name (

{ column_name data_type [ NOT NULL ]

[ COMMENT col_comment]

[ WITH ( property_name = expression [, ...] ) ]

| LIKE existing_table_name

[ { INCLUDING | EXCLUDING } PROPERTIES ]

}

[, ...]

)

[ COMMENT table_comment ]

[ WITH ( property_name = expression [, ...] ) ]

CREATE [EXTERNAL] TABLE [IF NOT EXISTS]

[catalog_name.][db_name.]table_name (

{ column_name data_type [ NOT NULL ]

[ COMMENT comment ]

[ WITH ( property_name = expression [, ...] ) ]

| LIKE existing_table_name

[ { INCLUDING | EXCLUDING } PROPERTIES ]

}

[, ...]

)

[COMMENT 'table_comment']

[PARTITIONED BY(col_name data_type, ....)]

[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name, col_name, ...)] INTO num_buckets BUCKETS]

[ROW FORMAT row_format]

[STORED AS file_format]

[LOCATION 'hdfs_path']

[TBLPROPERTIES (orc_table_property = value [, ...] ) ]

CREATE [EXTERNAL] TABLE [IF NOT EXISTS]

[catalog_name.][db_name.]table_name (

{ column_name data_type [ NOT NULL ]

[ COMMENT comment ]

[ WITH ( property_name = expression [, ...] ) ]

| LIKE existing_table_name

[ { INCLUDING | EXCLUDING } PROPERTIES ]

}

[, ...]

)

[PARTITIONED BY(col_name data_type, ....)]

[SORT BY ([column [, column ...]])]

[COMMENT 'table_comment']

[ROW FORMAT row_format]

[STORED AS file_format]

[LOCATION 'hdfs_path']

[TBLPROPERTIES (orc_table_property = value [, ...] ) ]

限制

  • session属性可以设置bucket_count,默认值为-1,表示未设置。创建分区表时,如果bucket_count为-1且建表语句中未设置buckets,则使用默认值16。
  • 默认外部表存储位置/user/hive/warehouse/{schema_name}/{table_name},其中{schema_name}为建表时使用的schema,{table_name}为表名。
  • 指定属性“transactional=true”可以让表支持“原子性、一致性、隔离性、持久性”写入的事务能力,但是将表定义为事务表后,无法通过设置“transactional=false”将其退化为非事务表。

    transactional='true'或 '0'在执行过程中不会进行类型转换,所以这种写法会抛出异常:

    Cannot convert ['true'] to boolean

    Cannot convert ['0'] to boolean

  • 默认不允许向托管表(表属性external = true)插入数据,如需使用该功能,可参考注意事项,添加hive自定义属性:hive.non-managed-table-writes-enabled=true。
  • Mppdb有一个限制,数据库的标识符的最大长度为63,如果把标识符命名超过了最大长度,那么会被自动截取掉超出的部分,只留下最大长度的标识符。
  • 跨域场景不支持建表。

描述

使用CREATE TABLE创建一个具有指定列的、新的空表。使用CREATE TABLE AS创建带数据的表。

  • 使用可选参数IF NOT EXISTS,如果表已经存在则不会报错。
  • WITH子句可用于在新创建的表或单列上设置属性,如表的存储位置(location)、是不是外表(external)等。
  • LIKE子句用于在新表中包含来自现有表的所有列定义。可以指定多个LIKE子句,从而允许从多个表中复制列。如果指定了INCLUDING PROPERTIES,则将所有表属性复制到新表中。如果WITH子句指定的属性名称与复制的属性名称相同,则将使用WITH子句中的值。默认是EXCLUDING PROPERTIES属性,而且最多只能为一个表指定INCLUDING PROPERTIES属性。
  • PARTITIONED BY能够用于指定分区的列;CLUSTERED BY能够被用于指定分桶的列;SORT BY和SORTED BY能够用于给指定的分桶列进行排序;BUCKETS能够被用于指定分桶数;EXTERNAL可用于指定创建外部表;STORED AS能被用于指定文件存储的格式;LOCATION能被用于指定在HDFS上存储的路径。

想要查看支持哪些column属性,可以运行以下命令,会显示当前对接的catalog分别支持哪些列属性。

SELECT * FROM system.metadata.column_properties;

想要查看支持哪些table属性,可以运行以下命令:

SELECT * FROM system.metadata.table_properties;

下表为catalog为hive时的查询结果。

SELECT * FROM system.metadata.table_properties where catalog_name = 'hive'; 

catalog_name

property_name

default_value

type

description

hive

auto_purge

false

boolean

Skip trash when table or partition is deleted

hive

avro_schema_url

-

varchar

URI pointing to Avro schema for the table

hive

bucket_count

0

integer

Number of buckets

hive

bucketed_by

[]

array(varchar)

Bucketing columns

hive

bucketing_version

-

integer

Bucketing version

hive

csv_escape

-

varchar

CSV escape character

hive

csv_quote

-

varchar

CSV quote character

hive

csv_separator

-

varchar

CSV separator character

hive

external_location

-

varchar

File system location URI for external table

hive

format

ORC

varchar

Hive storage format for the table. Possible values: [ORC, PARQUET, AVRO, RCBINARY, RCTEXT, SEQUENCEFILE, JSON, TEXTFILE, TEXTFILE_MULTIDELIM, CSV]

hive

orc_compress

GZIP

varchar

Compression codec used. Possible values: [NONE, SNAPPY, LZ4, ZSTD, GZIP, ZLIB]

hive

orc_compress_size

262144

bigint

orc compression size

hive

orc_row_index_stride

10000

integer

no. of row index strides

hive

orc_stripe_size

67108864

bigint

orc stripe size

hive

orc_bloom_filter_columns

[]

array(varchar)

ORC Bloom filter index columns

hive

orc_bloom_filter_fpp

0.05

double

ORC Bloom filter false positive probability

hive

partitioned_by

[]

array(varchar)

Partition columns

hive

sorted_by

[]

array(varchar)

Bucket sorting columns

hive

textfile_skip_footer_line_count

-

integer

Number of footer lines

hive

textfile_skip_header_line_count

-

integer

Number of header lines

hive

transactional

false

boolean

Is transactional property enabled

示例

  • 创建一个新表orders,使用子句with指定创建表的存储格式、存储位置、以及是否为外表。

    通过“auto.purge”参数可以指定涉及到数据移除操作(如DROP、DELETE、INSERT OVERWRITE、TRUNCATE TABLE)时是否清除相关数据:

    • "auto.purge"='true'时,清除元数据和数据文件。
    • "auto.purge"='false'时,仅清除元数据,数据文件会移入HDFS回收站。默认值为“false”,且不建议用户修改此属性,避免数据删除后无法恢复。
    CREATE TABLE orders (
    orderkey bigint,
    orderstatus varchar,
    totalprice double,
    orderdate date
    )
    WITH (format = 'ORC', location='/user',orc_compress='ZLIB',external=true, "auto.purge"=false);
    
    -- 通过DESC FORMATTED 语句,可以查看建表的详细信息
    desc formatted  orders ;
                               Describe Formatted Table                           
    ------------------------------------------------------------------------------
     # col_name      data_type      comment                                       
     orderkey      bigint                                                         
     orderstatus      varchar                                                     
     totalprice      double                                                       
     orderdate      date                                                          
    
     # Detailed Table Information                                                 
     Database:                   default                                          
     Owner:                      admintest                                        
     LastAccessTime:             0                                                
     Location:                   hdfs://hacluster/user                            
     Table Type:                 EXTERNAL_TABLE                                   
    
     # Table Parameters:                                                          
            EXTERNAL                TRUE                                                     
            auto.purge              false                                                     
            orc.compress.size       262144                                                   
            orc.compression.codec   ZLIB                                                    
            orc.row.index.stride    10000                                                    
            orc.stripe.size         67108864                                                 
            presto_query_id         20220812_084110_00050_srknk@default@HetuEngine           
            presto_version          1.2.0-h0.cbu.mrs.320.r1-SNAPSHOT                         
            transient_lastDdlTime   1660293670                                              
    
     # Storage Information                                                        
     SerDe Library:              org.apache.hadoop.hive.ql.io.orc.OrcSerde        
     InputFormat:                org.apache.hadoop.hive.ql.io.orc.OrcInputFormat  
     OutputFormat:               org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 
     Compressed:                 No                                               
     Num Buckets:                -1                                               
     Bucket Columns:             []                                               
     Sort Columns:               []                                               
     Storage Desc Params:                                                         
            serialization.format    1                                                        
    (1 row)
  • 创建一个新表,指定Row format:
    --建表时,指定表的字段分隔符为‘,’号(如果创建外表,要求数据文件中的每条记录的字段是以逗号进行分隔)
    CREATE TABLE student(
    id string,birthday string,
    grade int,
    memo string) 
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
    
    --建表时,指定字段分隔符为'\t',换行符为'\n'
    CREATE TABLE test(
    id int, 
    name string ,
    tel string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
    STORED AS TEXTFILE;
  • 如果表orders不存在,则创建表orders,并且增加表注释和列注释:
    CREATE TABLE IF NOT EXISTS orders (
    orderkey bigint,
    orderstatus varchar,
    totalprice double COMMENT 'Price in cents.',
    orderdate date
    )
    COMMENT 'A table to keep track of orders.';
    insert into orders values
    (202011181113,'online',9527,date '2020-11-11'),
    (202011181114,'online',666,date '2020-11-11'),
    (202011181115,'online',443,date '2020-11-11'),
    (202011181115,'offline',2896,date '2020-11-11');
  • 使用表orders的列定义创建表bigger_orders:
    CREATE TABLE bigger_orders (
    another_orderkey bigint,
    LIKE orders,
    another_orderdate date
    );
    
    SHOW CREATE TABLE bigger_orders ;
                                Create Table                             
    ---------------------------------------------------------------------
     CREATE TABLE hive.default.bigger_orders (                           
        another_orderkey bigint,
        orderkey bigint,
        orderstatus varchar,
        totalprice double,
        ordersdate date,                                             
        another_orderdate date                                           
     )                                                                   
     WITH (                                                              
        external = false,                                                
        format = 'ORC',                                                  
        location = 'hdfs://hacluster/user/hive/warehouse/bigger_orders', 
        orc_compress = 'GZIP',                                           
        orc_compress_size = 262144,                                      
        orc_row_index_stride = 10000,                                    
        orc_stripe_size = 67108864                                       
     )                                                                   
    (1 row)
  • 标号 建表示例:
    CREATE EXTERNAL TABLE hetu_test (orderkey bigint, orderstatus varchar, totalprice double, orderdate date) PARTITIONED BY(ds int) SORT BY (orderkey, orderstatus) COMMENT 'test' STORED AS ORC LOCATION '/user' TBLPROPERTIES (orc_compress = 'SNAPPY', orc_compress_size = 6710422, orc_bloom_filter_columns = 'orderstatus,totalprice');
  • 标号 建表示例:
    CREATE EXTERNAL TABLE hetu_test1 (orderkey bigint, orderstatus varchar, totalprice double, orderdate date) COMMENT 'test' PARTITIONED BY(ds int) CLUSTERED BY (orderkey, orderstatus) SORTED BY (orderkey, orderstatus) INTO 16 BUCKETS STORED AS ORC LOCATION '/user'  TBLPROPERTIES (orc_compress = 'SNAPPY', orc_compress_size = 6710422, orc_bloom_filter_columns = 'orderstatus,totalprice');
  • 标号 建表示例:
    CREATE TABLE hetu_test2 (orderkey bigint, orderstatus varchar, totalprice double, orderdate date, ds int) COMMENT 'This table is in Hetu syntax' WITH (partitioned_by = ARRAY['ds'], bucketed_by = ARRAY['orderkey', 'orderstatus'], sorted_by = ARRAY['orderkey', 'orderstatus'], bucket_count = 16, orc_compress = 'SNAPPY', orc_compress_size = 6710422, orc_bloom_filter_columns = ARRAY['orderstatus', 'totalprice'], external = true, format = 'orc', location = '/user');
  • 查看表的建表语句:
    show create table hetu_test1;
                               Create Table                           
    ------------------------------------------------------------------
     CREATE TABLE hive.default.hetu_test1 (                           
        orderkey bigint,                                              
        orderstatus varchar,                                          
        totalprice double,                                            
        orderdate date,                                               
        ds integer                                                    
     )                                                                
     COMMENT 'test'                                                   
     WITH (                                                           
        bucket_count = 16,                                            
        bucketed_by = ARRAY['orderkey','orderstatus'],                
        bucketing_version = 1,                                        
        external_location = 'hdfs://hacluster/user',                  
        format = 'ORC',                                               
        orc_bloom_filter_columns = ARRAY['orderstatus','totalprice'], 
        orc_bloom_filter_fpp = 5E-2,                                  
        orc_compress = 'SNAPPY',                                      
        orc_compress_size = 6710422,                                  
        orc_row_index_stride = 10000,                                 
        orc_stripe_size = 67108864,                                   
        partitioned_by = ARRAY['ds'],                                 
        sorted_by = ARRAY['orderkey','orderstatus']                   
     )                                                                
    (1 row)

创建分区表

--创建schema
CREATE SCHEMA hive.web WITH (location = 'hdfs://hacluster/user');
--创建分区表
CREATE TABLE hive.web.page_views (
  view_time timestamp,
  user_id bigint,
  page_url varchar,
  ds date,
  country varchar
)
WITH (
  format = 'ORC',
  partitioned_by = ARRAY['ds', 'country'],
  bucketed_by = ARRAY['user_id'],
  bucket_count = 50
);
--插入空的分区
CALL system.create_empty_partition(
    schema_name => 'web',
    table_name => 'page_views',
    partition_columns => ARRAY['ds', 'country'],
    partition_values => ARRAY['2020-07-17', 'US']);
    
CALL system.create_empty_partition(
    schema_name => 'web',
    table_name => 'page_views',
    partition_columns => ARRAY['ds', 'country'],
    partition_values => ARRAY['2020-07-18', 'US']);
    
--查看分区
SELECT * FROM hive.web."page_views$partitions";
     ds     | country 
------------|---------
 2020-07-18 | US      
 2020-07-17 | US    
--插入数据
insert into hive.web.page_views values(timestamp '2020-07-17 23:00:15',bigint '15141','www.local.com',date '2020-07-17','US' );
insert into hive.web.page_views values(timestamp '2020-07-18 23:00:15',bigint '18148','www.local.com',date '2020-07-18','US' );

--查询数据
 select * from hive.web.page_views;
        view_time        | user_id |   page_url    |     ds     | country 
-------------------------|---------|---------------|------------|---------
 2020-07-17 23:00:15.000 |   15141 | www.local.com | 2020-07-17 | US      
2020-07-18 23:00:15.000 |   18148 | www.local.com | 2020-07-18 | US