Sample Reference
This section walks through a tensor addition sample that applies the TIK optimization techniques described earlier, to deepen the reader's understanding of TIK programming and optimization.
To keep the tensor addition sample clear and readable, this chapter organizes the implementation as a class, whose structure is outlined below.
class Vadd():
    # Accept the input data and perform the related initialization calculations
    def __init__(self, input_x, input_y, kernel_name="vadd_sample"):

    # Perform the operator computation and compilation
    def vadd_compute(self):

    # Define the computation on each AI Core
    def vadd_compute_each_core(self, move_offset, move_num):

    # Define the per-tile computation on an AI Core
    def vadd_compute_each_loop(self, move_offset, move_num):

# Entry function, used for functionality and performance testing
def vadd_sample(input_x, input_y, output_z, kernel_name):
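Before the full listing, the following minimal sketch works through the buffer-sizing arithmetic that __init__ performs. The concrete numbers (a 256 KB Unified Buffer and float16 inputs) are assumptions chosen for illustration, not values taken from the sample, which queries them at runtime.

block_byte_size = 32                  # UB reads/writes are 32-byte aligned
ub_size_bytes = 256 * 1024            # assumed UB_SIZE; the sample queries it at runtime
dtype_bytes_size = 2                  # assumed float16, 2 bytes per element
data_each_block = block_byte_size // dtype_bytes_size            # 16 elements per block
# Half of the UB for each input (the result reuses input_x_ub),
# rounded down to a 32-byte boundary:
ub_tensor_size = (ub_size_bytes // dtype_bytes_size // 2
                  // data_each_block * data_each_block)          # 65536 elements
vector_mask_max = 8 * data_each_block                            # one repeat covers up to 128 elements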
The complete sample is shown below.
import math
from functools import reduce as functools_reduce
import numpy as np
from te import tik
from te import platform as cce
from topi.cce import util


class Vadd():
    def __init__(self, input_x, input_y, kernel_name="vadd_sample"):
        self.shape_x = input_x.get("shape")
        self.dtype_x = input_x.get("dtype")
        self.shape_y = input_y.get("shape")
        self.dtype_y = input_y.get("dtype")
        self.kernel_name = kernel_name

        self.tik_instance = tik.Tik()
        self.aicore_num = 2

        # Reads and writes on the Unified Buffer must be 32-byte aligned;
        # this constant is used to size the tensor tiles and to compute
        # data-move instruction parameters.
        block_byte_size = 32
        # Query the Unified Buffer size, in bytes.
        ub_size_bytes = cce.get_soc_spec("UB_SIZE")
        # Compute how many elements of the input dtype fit in one block.
        dtype_bytes_size = cce.cce_intrin.get_bit_len(self.dtype_x) // 8
        self.data_each_block = block_byte_size // dtype_bytes_size
        # Unified Buffer space (in elements) allocated to each of the two
        # inputs; the result overlaps the first input's address range.
        # The size is rounded down to a 32-byte boundary.
        self.ub_tensor_size = (
            ub_size_bytes // dtype_bytes_size // 2 //
            self.data_each_block * self.data_each_block)
        # Total number of input elements.
        self.input_num = functools_reduce(lambda x, y: x * y, self.shape_x)
        # Amount of data each AI Core processes. Only the even-split case,
        # with each share 32-byte aligned, is considered here.
        self.data_num_each_core = self.input_num // self.aicore_num
        # A vector instruction processes at most 8 blocks per repeat;
        # this is the maximum mask value.
        self.vector_mask_max = 8 * self.data_each_block

        self.input_x_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_x_gm", scope=tik.scope_gm)
        self.input_y_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_y_gm", scope=tik.scope_gm)
        self.output_z_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="output_z_gm", scope=tik.scope_gm)

    def vadd_compute(self):
        with self.tik_instance.for_range(
                0, self.aicore_num, block_num=self.aicore_num) as index:
            # Create the Unified Buffer tensors for the two inputs.
            self.input_x_ub = self.tik_instance.Tensor(
                self.dtype_x, (self.ub_tensor_size,),
                name="input_x_ub",
                scope=tik.scope_ubuf)
            self.input_y_ub = self.tik_instance.Tensor(
                self.dtype_y, (self.ub_tensor_size,),
                name="input_y_ub",
                scope=tik.scope_ubuf)
            # Offset into Global Memory for this core: the number of
            # elements already covered by the preceding cores.
            move_offset = index * self.data_num_each_core
            # Each AI Core computes the data slice it is responsible for.
            self.vadd_compute_each_core(move_offset, self.data_num_each_core)

        self.tik_instance.BuildCCE(
            kernel_name=self.kernel_name,
            inputs=[self.input_x_gm, self.input_y_gm],
            outputs=[self.output_z_gm])
        return self.tik_instance

    def vadd_compute_each_core(self, move_offset, move_num):
        # Keep this core's base offset; tile offsets are relative to it.
        core_offset = move_offset
        loop_time = move_num // self.ub_tensor_size
        if loop_time > 0:
            with self.tik_instance.for_range(0, loop_time) as loop_index:
                move_offset = core_offset + loop_index * self.ub_tensor_size
                self.vadd_compute_each_loop(move_offset, self.ub_tensor_size)
            move_offset = core_offset + loop_time * self.ub_tensor_size
        last_num = move_num % self.ub_tensor_size
        if last_num > 0:
            self.vadd_compute_each_loop(move_offset, last_num)

    def vadd_compute_each_loop(self, move_offset, move_num):
        # burst_len for each data move, rounded up to whole blocks.
        burst_len = math.ceil(move_num / self.data_each_block)
        self.tik_instance.data_move(self.input_x_ub,
                                    self.input_x_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
        self.tik_instance.data_move(self.input_y_ub,
                                    self.input_y_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
        # One vec_add covers at most vector_mask_max elements per repeat
        # and at most 255 repeats, so larger tiles need an outer loop.
        vadd_loop = move_num // (self.vector_mask_max * 255)
        add_offset = 0
        if vadd_loop > 0:
            with self.tik_instance.for_range(0, vadd_loop) as add_index:
                add_offset = add_index * self.vector_mask_max * 255
                self.tik_instance.vec_add(self.vector_mask_max,
                                          self.input_x_ub[add_offset],
                                          self.input_x_ub[add_offset],
                                          self.input_y_ub[add_offset],
                                          255, 8, 8, 8)
            add_offset = vadd_loop * self.vector_mask_max * 255
        repeat_time = (
            move_num % (self.vector_mask_max * 255) // self.vector_mask_max)
        if repeat_time > 0:
            self.tik_instance.vec_add(self.vector_mask_max,
                                      self.input_x_ub[add_offset],
                                      self.input_x_ub[add_offset],
                                      self.input_y_ub[add_offset],
                                      repeat_time, 8, 8, 8)
            add_offset += repeat_time * self.vector_mask_max
        last_num = move_num % self.vector_mask_max
        if last_num > 0:
            self.tik_instance.vec_add(last_num,
                                      self.input_x_ub[add_offset],
                                      self.input_x_ub[add_offset],
                                      self.input_y_ub[add_offset],
                                      1, 8, 8, 8)
        self.tik_instance.data_move(self.output_z_gm[move_offset],
                                    self.input_x_ub, 0, 1, burst_len, 0, 0)


@util.check_input_type(dict, dict, dict, str)
def vadd_sample(input_x, input_y, output_z, kernel_name):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    input_y : dict
        shape and dtype of input
    output_z : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "vadd_sample"

    Returns
    -------
    tik_instance : the tik.Tik instance after BuildCCE
    """
    vadd_instance = Vadd(input_x, input_y, kernel_name)
    tik_instance = vadd_instance.vadd_compute()
    return tik_instance
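For reference, the following is a minimal invocation sketch. The shape, dtype, and the functional check through the TIK debugger (tikdb) are assumptions about the local CANN environment and are not part of the sample above; the feed_dict keys must match the names of the Global Memory tensors created in __init__.

import numpy as np

# Hypothetical input descriptions; any shape that splits evenly across
# the two AI Cores and is 32-byte aligned per core would work here.
shape, dtype = (2, 1024), "float16"
input_x = {"shape": shape, "dtype": dtype}
input_y = {"shape": shape, "dtype": dtype}
output_z = {"shape": shape, "dtype": dtype}

tik_instance = vadd_sample(input_x, input_y, output_z, "vadd_sample")

# Functional check via the TIK debugger (assumes a CANN release that
# provides tikdb on the tik.Tik instance).
data_x = np.random.uniform(1, 10, shape).astype(dtype)
data_y = np.random.uniform(1, 10, shape).astype(dtype)
feed_dict = {"input_x_gm": data_x, "input_y_gm": data_y}
result, = tik_instance.tikdb.start_debug(feed_dict=feed_dict, interactive=False)
assert np.allclose(result, data_x + data_y, rtol=1e-3)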
