创建Hudi表

注意事项

DGC执行SparkSQL分为代理连接和API连接两种方式。使用API连接执行Hudi相关的命令时，需要在作业的运行参数中添加"--conf spark.support.hudi=true"。
Hudi表的字段名称不能大写，因为Spark/Flink/Hive/HetuEngine引擎之间有大小写兼容性差异。
Hudi表必须有主键，主键字段的值不能包含逗号、冒号、Null以及空值；主键在建表后无法修改。
Hudi表必须有precombine字段，precombine字段的值不能包含Null以及空值。precombine字段只能设置为Hudi表中的一个列，建表后无法修改。注意：该字段决定Hudi表的更新逻辑，只有新数据该列的值大于等于旧数据该列的值时才会更新。
Hudi表的索引有默认值：MRS 3.3.1之前版本是BLOOM索引，MRS 3.5.0及之后版本是SIMPLE索引。建议在建表时显式指定适合业务的索引，索引类型在建表后无法修改。
如果使用bucket索引，必须按照确定表索引章节去预估bucket桶数，建表后无法修改。
建表语句中指定location就是外表，不指定location就是内表。对内表执行drop table会删除Hive上的表以及数据存储目录。对外表执行drop table仅会删除Hive上的表，不删除数据存储目录；使用"drop table 表名 purge"可以同时删除Hive上的表以及数据存储目录，请谨慎使用。

DGC/SparkSQL

单主键Hudi表。

-- Hudi table with a single primary key.
CREATE TABLE hudi_table (
    id    INT,
    name  STRING,
    price DOUBLE
)
USING hudi
OPTIONS (
    type = 'cow',
    primaryKey = 'id',            -- a primary key must be specified
    preCombineField = 'id',       -- a precombine field must be specified; setting it to the primary key column is usually enough to get update-by-primary-key behavior
    hoodie.index.type = 'SIMPLE'  -- the default index is used when this is omitted
);

多主键Hudi表。

-- Hudi table with a composite (multi-column) primary key.
CREATE TABLE hudi_table (
    id1   INT,
    id2   INT,
    name  STRING,
    price DOUBLE
)
USING hudi
OPTIONS (
    type = 'mor',
    primaryKey = 'id1,id2',      -- a primary key must be specified; any number of key columns is allowed, separated by commas
    preCombineField = 'id1',     -- a precombine field must be specified; it can only be a single column
    hoodie.index.type = 'BLOOM'  -- the default index is used when this is omitted
);

BUCKET索引Hudi表。

-- Hudi table that uses the BUCKET index.
CREATE TABLE hudi_table (
    id1   INT,
    id2   INT,
    name  STRING,
    price DOUBLE
)
USING hudi
OPTIONS (
    type = 'mor',
    primaryKey = 'id1,id2',                      -- a primary key must be specified; any number of key columns is allowed, separated by commas
    preCombineField = 'id1',                     -- a precombine field must be specified; it can only be a single column
    hoodie.index.type = 'BUCKET',                -- required
    hoodie.bucket.index.num.buckets = '5',       -- required; the bucket count must be estimated as described in section 6.2.2
    hoodie.bucket.index.hash.field = 'id1,id2'   -- optional; the bucket hash field defaults to the primary key and usually does not need to be set
);

分区表。

-- Partitioned Hudi table, partitioned by par1 and par2.
CREATE TABLE hudi_table (
    id1   INT,
    id2   INT,
    par1  INT,
    par2  INT,
    name  STRING,
    price DOUBLE
)
USING hudi
OPTIONS (
    ......  -- NOTE(review): placeholder from the original example; the table options appear to be elided here and must be filled in
)
PARTITIONED BY (par1, par2);

外表。

-- External Hudi table: an explicit location is given in the statement.
CREATE TABLE hudi_table (
    id1   INT,
    id2   INT,
    par1  INT,
    par2  INT,
    name  STRING,
    price DOUBLE
)
USING hudi
OPTIONS (
    ......  -- NOTE(review): placeholder from the original example; the table options appear to be elided here and must be filled in
)
PARTITIONED BY (par1, par2)
LOCATION "hdfs://.../hudi_table"; -- HDFS path or OBS path