更新时间:2021-03-18 GMT+08:00
分享

样例参考

本节设计了一个张量加法的Sample,通过综合应用上述TIK优化机制,增强用户对TIK编程与优化的理解。

学习完本节的张量加法Sample后,您可以参考《样例参考》章节了解更多TIK算子样例。

为保证张量加法Sample的结构清晰性和易读性,本章采用类的形式对Sample进行组织和实现,类的定义如下所示。

class Vadd():
       # 接收数据,并完成相关初始化计算
       def __init__(self, input_x, input_y, kernel_name="vadd_sample"):
       
       # 完成算子计算与编译
       def vadd_compute(self):
       
       # 定义每个AI Core上的运算
       def vadd_compute_each_core(self, move_offset, move_num):
       
       # 定义AI Core上的分片计算
       def vadd_compute_each_loop(self, move_offset, move_num):
       
       # 用于功能和性能测试
def vadd_sample(input_x, input_y, output_z, kernel_name):

完整Sample如下所示。

import math
from functools import reduce as functools_reduce
import numpy as np
from te import tik
from te import platform as cce
from topi.cce import util

class Vadd():
    """Element-wise tensor addition (output_z = input_x + input_y) on AI Cores,
    written with the TIK DSL.

    The work is split evenly across ``aicore_num`` cores; each core streams its
    slice through the Unified Buffer (UB) in ``ub_tensor_size``-element chunks
    and issues ``vec_add`` vector instructions over each chunk.
    """

    def __init__(self, input_x, input_y, kernel_name="vadd_sample"):
        # input_x / input_y are dicts carrying at least "shape" and "dtype"
        # (the standard op-interface convention; see vadd_sample's docstring).
        self.shape_x = input_x.get("shape")
        self.dtype_x = input_x.get("dtype")
        self.shape_y = input_y.get("shape")
        self.dtype_y = input_y.get("dtype")
        self.kernel_name = kernel_name
        self.tik_instance = tik.Tik()
        self.aicore_num = 2

        # UB reads/writes must be 32-byte aligned; this drives both the UB
        # tensor partitioning and the data_move burst parameters.
        block_bite_size = 32
        # Total Unified Buffer size in bytes.
        # BUGFIX: the original called te.platform.get_soc_spec(...), but `te`
        # itself is never imported (only `from te import platform as cce`),
        # which raised NameError. `cce` IS te.platform, so call it directly.
        ub_size_bytes = cce.get_soc_spec("UB_SIZE")

        # Number of elements of this dtype that fit in one 32-byte block.
        dtype_bytes_size = cce.cce_intrin.get_bit_len(self.dtype_x) // 8
        self.data_each_block = block_bite_size // dtype_bytes_size

        # Per-input UB tensor size in elements, rounded down to a 32B multiple.
        # UB is split in half between the two inputs; the result overwrites
        # input_x's half in place, so no third buffer is needed.
        self.ub_tensor_size = (
            ub_size_bytes // dtype_bytes_size // 2 // self.data_each_block *
            self.data_each_block)

        # Total number of input elements.
        self.input_num = functools_reduce(lambda x, y: x * y, self.shape_x)

        # Elements handled by each core. Only the even-split case is handled,
        # and the split is assumed to stay 32-byte aligned.
        self.data_num_each_core = self.input_num // self.aicore_num

        # A vector instruction covers at most 8 blocks per repeat, so this is
        # the maximum mask value for vec_add.
        self.vector_mask_max = 8 * self.data_each_block

        self.input_x_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_x_gm", scope=tik.scope_gm)
        self.input_y_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_y_gm", scope=tik.scope_gm)
        self.output_z_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="output_z_gm", scope=tik.scope_gm)

    def vadd_compute(self):
        """Emit the multi-core kernel, build it with BuildCCE, and return the
        tik instance."""
        with self.tik_instance.for_range(
                0, self.aicore_num, block_num=self.aicore_num) as index:
            # UB tensors for the two inputs, allocated per core iteration.
            self.input_x_ub = self.tik_instance.Tensor(
                self.dtype_x, (self.ub_tensor_size,),
                name="input_x_ub",
                scope=tik.scope_ubuf)
            self.input_y_ub = self.tik_instance.Tensor(
                self.dtype_y, (self.ub_tensor_size,),
                name="input_y_ub",
                scope=tik.scope_ubuf)

            # Each core processes a contiguous GM slice starting at its offset.
            move_offset = index * self.data_num_each_core

            self.vadd_compute_each_core(move_offset, self.data_num_each_core)

        self.tik_instance.BuildCCE(
            kernel_name=self.kernel_name,
            inputs=[self.input_x_gm, self.input_y_gm],
            outputs=[self.output_z_gm])

        return self.tik_instance

    def vadd_compute_each_core(self, move_offset, move_num):
        """Process ``move_num`` elements starting at GM offset ``move_offset``
        on one core, in UB-sized chunks plus a tail chunk."""
        core_base = move_offset
        loop_time = move_num // self.ub_tensor_size
        if loop_time > 0:
            with self.tik_instance.for_range(0, loop_time) as loop_index:
                # BUGFIX: the original overwrote move_offset with
                # loop_index * ub_tensor_size, dropping the per-core base —
                # every core after core 0 would have re-processed core 0's
                # slice whenever loop_time > 0.
                self.vadd_compute_each_loop(
                    core_base + loop_index * self.ub_tensor_size,
                    self.ub_tensor_size)
            move_offset = core_base + loop_time * self.ub_tensor_size

        # Tail: remaining elements that do not fill a whole UB chunk.
        last_num = move_num % self.ub_tensor_size
        if last_num > 0:
            self.vadd_compute_each_loop(move_offset, last_num)

    def vadd_compute_each_loop(self, move_offset, move_num):
        """Add one UB-sized slice: GM -> UB, vec_add in UB, UB -> GM.

        The result is computed in place in input_x_ub, then moved out to
        output_z_gm at the same offset.
        """
        # Burst length in 32B blocks; ceil so a partial block still moves.
        burst_len = math.ceil(move_num / self.data_each_block)

        self.tik_instance.data_move(self.input_x_ub,
                                    self.input_x_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
        self.tik_instance.data_move(self.input_y_ub,
                                    self.input_y_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)

        # vec_add allows at most 255 repeats per call, each covering up to
        # vector_mask_max elements; larger slices need an outer loop.
        vadd_loop = move_num // (self.vector_mask_max * 255)
        add_offset = 0
        if vadd_loop > 0:
            with self.tik_instance.for_range(0, vadd_loop) as add_index:
                add_offset = add_index * self.vector_mask_max * 255
                self.tik_instance.vec_add(self.vector_mask_max,
                                       self.input_x_ub[add_offset],
                                       self.input_x_ub[add_offset],
                                       self.input_y_ub[add_offset],
                                       255, 8, 8, 8)
            # BUGFIX: the original referenced bare `vector_mask_max` here,
            # which raised NameError whenever vadd_loop > 0.
            add_offset = vadd_loop * self.vector_mask_max * 255

        # Full-mask repeats that did not fit a complete 255-repeat batch.
        repeat_time = (
            move_num % (self.vector_mask_max * 255) // self.vector_mask_max)
        if repeat_time > 0:
            self.tik_instance.vec_add(self.vector_mask_max,
                                   self.input_x_ub[add_offset],
                                   self.input_x_ub[add_offset],
                                   self.input_y_ub[add_offset],
                                   repeat_time, 8, 8, 8)
            add_offset += repeat_time * self.vector_mask_max

        # Final partial repeat with a reduced mask.
        last_num = move_num % self.vector_mask_max
        if last_num > 0:
            self.tik_instance.vec_add(last_num,
                                   self.input_x_ub[add_offset],
                                   self.input_x_ub[add_offset],
                                   self.input_y_ub[add_offset],
                                   1, 8, 8, 8)

        self.tik_instance.data_move(self.output_z_gm[move_offset],
                                    self.input_x_ub, 0, 1, burst_len, 0, 0)
 
 
@util.check_input_type(dict, dict, dict, str)
def vadd_sample(input_x, input_y, output_z, kernel_name):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    input_y : dict
        shape and dtype of input
    output_z : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "vadd_sample"

    Returns
    -------
    None
    """
    # Build the op object and emit/compile the kernel in one expression;
    # output_z is validated by the decorator but derived from input_x inside Vadd.
    return Vadd(input_x, input_y, kernel_name).vadd_compute()
分享:

    相关文档

    相关产品

close