Flink场景
MOR表参数简化并托管表服务
-- Simplified Flink sink DDL for a Hudi MERGE_ON_READ table whose table
-- services are delegated to LDMS via 'hoodie.managed.by.ldms'.
CREATE TABLE hudi_mor (
    业务字段  -- placeholder: replace with the business column definitions
) WITH (
    'connector' = 'hudi',
    'path' = 'hdfs://hacluster/tmp/hudi/stream_mor',
    'table.type' = 'MERGE_ON_READ',
    -- Record key: replace the placeholder value with the primary-key column.
    'hoodie.datasource.write.recordkey.field' = '主键字段',
    'hoodie.bucket.index.num.buckets' = '20',
    -- Hands table maintenance over to LDMS; Hive sync must be configured too.
    'hoodie.managed.by.ldms' = 'true',
    -- Hive sync settings: replace the placeholder values with the target
    -- Hive table/database and the values from the Hive client's hive-site.xml.
    'hive_sync.enable' = 'true',
    'hive_sync.table' = '要同步到Hive的表名',
    'hive_sync.db' = '要同步到Hive的数据库名',
    'hive_sync.mode' = 'hms',
    'hive_sync.metastore.uris' = 'Hive客户端hive-site.xml文件中hive.metastore.uris的值',
    'properties.hive.metastore.kerberos.principal' = 'Hive客户端hive-site.xml文件中hive.metastore.kerberos.principal的值'
)
Flink写MOR表的场景下,如果MRS集群已安装LDMS服务,可以在Sink Hudi表的With属性中添加hoodie.managed.by.ldms配置(必须同时配置Hive同步)。该配置会自动将当前MOR表设置为只生成Compaction计划,并关闭Clean和Archive,由LDMS服务托管该表的Compaction/Clean/Archive维护操作。此外,它还会自动补充以下参数:将write.precombine.field设置为主键字段(联合主键时取第一个字段作为write.precombine.field)、将索引设置为BUCKET、将写入方式设置为LSM、将Parquet文件的压缩格式设置为zstd,并开启Log文件的Log Index。这些能力可以提升读写性能并优化存储。上述建表语句实现的效果等价于:
-- Fully expanded form of the simplified LDMS-managed MERGE_ON_READ sink DDL:
-- every option that 'hoodie.managed.by.ldms' fills in is spelled out here.
CREATE TABLE hudi_mor (
    业务字段  -- placeholder: replace with the business column definitions
) WITH (
    'connector' = 'hudi',
    'path' = 'hdfs://hacluster/tmp/hudi/stream_mor',
    'table.type' = 'MERGE_ON_READ',
    -- Record key and precombine field both use the primary-key column
    -- (replace the placeholder values).
    'hoodie.datasource.write.recordkey.field' = '主键字段',
    'write.precombine.field' = '主键字段',
    -- Bucket index.
    'index.type' = 'BUCKET',
    'hoodie.bucket.index.num.buckets' = '20',
    'hoodie.managed.by.ldms' = 'true',
    -- LSM write style, zstd-compressed Parquet files, log index on log files.
    'hoodie.lsm.style' = 'true',
    'hoodie.parquet.compression.codec' = 'zstd',
    'hoodie.log.index.enable' = 'true',
    'hoodie.datasource.write.hive_style_partitioning' = 'true',
    -- Compaction: the job only schedules plans; it does not execute them.
    'compaction.delta_commits' = '10',
    'compaction.async.enabled' = 'false',
    'compaction.schedule.enabled' = 'true',
    -- Clean and archive are switched off in the job and left to LDMS.
    'clean.async.enabled' = 'false',
    'hoodie.clean.automatic' = 'false',
    'hoodie.archive.automatic' = 'false',
    -- Hive sync settings: replace the placeholder values with the target
    -- Hive table/database and the values from the Hive client's hive-site.xml.
    'hive_sync.enable' = 'true',
    'hive_sync.table' = '要同步到Hive的表名',
    'hive_sync.db' = '要同步到Hive的数据库名',
    'hive_sync.mode' = 'hms',
    'hive_sync.metastore.uris' = 'Hive客户端hive-site.xml文件中hive.metastore.uris的值',
    'properties.hive.metastore.kerberos.principal' = 'Hive客户端hive-site.xml文件中hive.metastore.kerberos.principal的值'
)
COW表参数简化并托管表服务
-- Simplified Flink sink DDL for a Hudi copy-on-write table whose Clean and
-- Archive services are delegated to LDMS via 'hoodie.managed.by.ldms'.
CREATE TABLE hudi_cow (
    业务字段,  -- placeholder: replace with the business column definitions
    PRIMARY KEY (主键字段) NOT ENFORCED  -- placeholder: primary-key column(s)
) WITH (
    'connector' = 'hudi',
    'path' = 'hdfs://hacluster/tmp/hudi/stream_cow',
    -- Hands table maintenance over to LDMS; Hive sync must be configured too.
    'hoodie.managed.by.ldms' = 'true',
    -- Hive sync settings: replace the placeholder values with the target
    -- Hive table/database and the values from the Hive client's hive-site.xml.
    'hive_sync.enable' = 'true',
    'hive_sync.table' = '要同步到Hive的表名',
    'hive_sync.db' = '要同步到Hive的数据库名',
    'hive_sync.mode' = 'hms',
    'hive_sync.metastore.uris' = 'Hive客户端hive-site.xml文件中hive.metastore.uris的值',
    'properties.hive.metastore.kerberos.principal' = 'Hive客户端hive-site.xml文件中hive.metastore.kerberos.principal的值'
)
Flink写COW表的场景下,如果MRS集群已安装LDMS服务,可以在Sink Hudi表的With属性中添加hoodie.managed.by.ldms配置(必须同时配置Hive同步)。该配置会自动关闭COW表的Clean和Archive,由LDMS服务托管该表的Clean和Archive维护操作。此外,它还会自动补充以下参数:将write.precombine.field设置为主键字段(联合主键时取第一个字段作为write.precombine.field)、将索引设置为MEMORY、将Parquet文件的压缩格式设置为zstd。上述建表语句实现的效果等价于:
-- Fully expanded form of the simplified LDMS-managed copy-on-write sink DDL:
-- every option that 'hoodie.managed.by.ldms' fills in is spelled out here.
-- The statement is split across lines so that the inline comment on
-- 'write.operation' no longer comments out the remaining options.
CREATE TABLE hudi_cow (
    业务字段,  -- placeholder: replace with the business column definitions
    PRIMARY KEY (主键字段) NOT ENFORCED  -- placeholder: primary-key column(s)
) WITH (
    'connector' = 'hudi',
    'path' = 'hdfs://hacluster/tmp/hudi/stream_cow',
    -- Upsert (update) mode is the default; set to 'insert' to enable append mode.
    'write.operation' = 'upsert',
    'index.type' = 'MEMORY',
    -- Precombine field uses the primary-key column (replace the placeholder).
    'write.precombine.field' = '主键字段',
    'hoodie.parquet.compression.codec' = 'zstd',
    'hoodie.managed.by.ldms' = 'true',
    -- Clean and archive are switched off in the job and left to LDMS.
    'clean.async.enabled' = 'false',
    'hoodie.clean.automatic' = 'false',
    'hoodie.archive.automatic' = 'false',
    -- Hive sync settings: replace the placeholder values with the target
    -- Hive table/database and the values from the Hive client's hive-site.xml.
    'hive_sync.enable' = 'true',
    'hive_sync.table' = '要同步到Hive的表名',
    'hive_sync.db' = '要同步到Hive的数据库名',
    'hive_sync.mode' = 'hms',
    'hive_sync.metastore.uris' = 'Hive客户端hive-site.xml文件中hive.metastore.uris的值',
    'properties.hive.metastore.kerberos.principal' = 'Hive客户端hive-site.xml文件中hive.metastore.kerberos.principal的值'
)
Append模式参数简化并托管表服务
Append模式写入时使用COW表,并搭配高性能小文件合并工具(参见使用指南15.2.4.17)和LDMS,由LDMS自动托管Clean/Archive。
-- Simplified Flink sink DDL for append-mode writes into a Hudi copy-on-write
-- table, with Clean/Archive delegated to LDMS via 'hoodie.managed.by.ldms'.
CREATE TABLE hudi_cow (
    业务字段,  -- placeholder: replace with the business column definitions
    PRIMARY KEY (主键字段) NOT ENFORCED  -- placeholder: primary-key column(s)
) WITH (
    'connector' = 'hudi',
    'path' = 'hdfs://hacluster/tmp/hudi/stream_cow',
    -- 'insert' selects append-mode writing.
    'write.operation' = 'insert',
    -- Hands table maintenance over to LDMS; Hive sync must be configured too.
    'hoodie.managed.by.ldms' = 'true',
    -- Hive sync settings: replace the placeholder values with the target
    -- Hive table/database and the values from the Hive client's hive-site.xml.
    'hive_sync.enable' = 'true',
    'hive_sync.table' = '要同步到Hive的表名',
    'hive_sync.db' = '要同步到Hive的数据库名',
    'hive_sync.mode' = 'hms',
    'hive_sync.metastore.uris' = 'Hive客户端hive-site.xml文件中hive.metastore.uris的值',
    'properties.hive.metastore.kerberos.principal' = 'Hive客户端hive-site.xml文件中hive.metastore.kerberos.principal的值'
)