更新时间:2021-03-18 GMT+08:00
分享

算子实现

本章节介绍样例中算子实现中的关键功能点。

算子代码实现

Add算子仅支持float16、float32、int32三种数据类型,所以需要对算子的输入数据进行校验;由于Add算子允许两个输入数据的shape不同,但算子计算接口te.lang.cce.vadd()要求两输入shape相同,因此需要对算子两个输入的shape进行广播并对其进行校验,算子实现代码示例如下所示:

“tbe/impl/add.py”

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
SHAPE_SIZE_LIMIT = 2147483648
# 比较两个输入的每个维度的大小,取每个维度的大值,生成out_shape
def _produce_shapes(shape1, shape2):
    shape1 = list(shape1)
    shape2 = list(shape2)
    flag = 0
    if len(shape1) < len(shape2):
        shape1, shape2 = shape2, shape1
        flag = 1

    output_shape_len = len(shape1)
    dec = output_shape_len - len(shape2)
    for i in range(dec):
        shape2 = [1] + shape2

    out_shape = []
    for i in range(output_shape_len):
        if (shape1[i] != shape2[i]) and (shape1[i] != 1) and (shape2[i] != 1):
            raise RuntimeError("input shapes not match!")
        out_shape.append(shape1[i] if shape1[i] > shape2[i] else shape2[i])

    if flag == 1:
        shape1, shape2 = shape2, shape1

    return shape1, shape2, out_shape

# Convert a TVM shape into a plain Python list.
def _shape_to_list(shape):
    """Return *shape* as a list, keeping symbolic tvm.expr.Var entries
    as-is and unwrapping constant dimensions to their integer .value."""
    return [dim if isinstance(dim, tvm.expr.Var) else dim.value
            for dim in shape]

# Compute logic of the Add operator.
@fusion_manager.register("add")
def add_compute(input_x, input_y, output_z, kernel_name="add"):
    """Broadcast both input tensors to their common shape and add them.

    Raises RuntimeError when the broadcast shape exceeds SHAPE_SIZE_LIMIT.
    Returns the result tensor of input_x + input_y.
    """
    x_shape = _shape_to_list(input_x.shape)
    y_shape = _shape_to_list(input_y.shape)
    # broadcast_shape takes the per-dimension maximum of the two inputs.
    x_shape, y_shape, broadcast_shape = _produce_shapes(x_shape, y_shape)

    total_size = reduce(lambda a, b: a * b, broadcast_shape)
    if total_size > SHAPE_SIZE_LIMIT:
        raise RuntimeError("the shape is too large to calculate")

    # Expand both tensors to the common shape, then add element-wise.
    broadcast_x = te.lang.cce.broadcast(input_x, broadcast_shape)
    broadcast_y = te.lang.cce.broadcast(input_y, broadcast_shape)
    return te.lang.cce.vadd(broadcast_x, broadcast_y)

# Operator entry function.
def add(input_x, input_y, output_z, kernel_name="add"):
    """Entry point of the Add operator: validate inputs, build the compute
    graph, auto-schedule it, and compile the kernel.

    Args:
        input_x: dict describing the first input; must carry "shape" and "dtype".
        input_y: dict describing the second input; must carry "shape" and "dtype".
        output_z: dict describing the output tensor (forwarded to add_compute).
        kernel_name: name of the generated kernel, defaults to "add".

    Raises:
        RuntimeError: if a dtype is unsupported, the two input dtypes differ,
            or the input shapes cannot be broadcast together.
    """
    # Fetch the shapes and dtypes of the operator inputs.
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")
    check_tuple = ("float16", "float32", "int32")
    input_data_type = input_x.get("dtype").lower()
    if input_data_type not in check_tuple:
        raise RuntimeError("only support %s while dtype is %s" %
                           (",".join(check_tuple), input_data_type))
    # Bug fix: input_y's dtype was previously never validated, and both
    # placeholders were created with input_x's dtype — a mismatched second
    # input would be silently miscomputed.  Validate it explicitly.
    input_data_type_y = input_y.get("dtype").lower()
    if input_data_type_y not in check_tuple:
        raise RuntimeError("only support %s while dtype is %s" %
                           (",".join(check_tuple), input_data_type_y))
    if input_data_type != input_data_type_y:
        raise RuntimeError("dtype of input_x and input_y must be the same")

    # Broadcast shape_x and shape_y in preparation for the tensor
    # placeholders created below.
    shape_x, shape_y, shape_max = _produce_shapes(shape_x, shape_y)
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        # Keep rank-1 shapes unchanged; otherwise drop a trailing dimension
        # of 1 — by memory layout 2*3 equals 2*3*1, and removing the
        # redundant axis improves subsequent scheduling efficiency.
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    # Placeholder for the first input tensor.
    data_x = tvm.placeholder(shape_x, name="data_1", dtype=input_data_type)
    # Placeholder for the second input tensor.
    data_y = tvm.placeholder(shape_y, name="data_2", dtype=input_data_type)

    # Build the compute graph.
    res = add_compute(data_x, data_y, output_z, kernel_name)
    # Auto scheduling.
    with tvm.target.cce():
        schedule = generic.auto_schedule(res)
    # Build configuration.
    config = {"name": kernel_name,
              "tensor_list": (data_x, data_y, res)}
    te.lang.cce.cce_build_code(schedule, config)

算子适配插件实现

将原始TensorFlow的Add算子解析并映射为适配昇腾AI处理器的Add算子,算子属性的映射可直接调用AutoMappingFn()接口进行实现,完整代码可参考sample样例中的“framework/tf_plugin/add_plugin.cpp”文件。

算子原型定义

原型定义的关键点是推理输出Tensor的shape以及对算子输入的内在关联关系进行校验。

Add算子推理输出shape的原理为:首先获取两个输入的shape,然后将两个输入shape广播为相同的shape,输出shape取两个输入中每个维度的大值。代码如下所示:

“op_proto/add.cpp”

bool InferShapeAndTypeAdd(Operator& op, const string& input_name1, const string& input_name2, const string& output_name) {
  // Derive the broadcast output shape of Add and publish it on the output
  // tensor descriptor together with the first input's dtype and format.
  TensorDesc outputDesc = op.GetOutputDesc(output_name);

  DataType inputDtype = op.GetInputDesc(input_name1).GetDataType();
  Format inputFormat = op.GetInputDesc(input_name1).GetFormat();

  std::vector<int64_t> dimsA = op.GetInputDesc(input_name1).GetShape().GetDims();
  std::vector<int64_t> dimsB = op.GetInputDesc(input_name2).GetShape().GetDims();

  // Make dimsA the higher-rank shape.
  if (dimsA.size() < dimsB.size()) {
    std::vector<int64_t> tmp = dimsA;
    dimsA = dimsB;
    dimsB = tmp;
  }

  // Left-pad the lower-rank shape with 1s until both ranks match.
  while (dimsB.size() < dimsA.size()) {
    dimsB.insert(dimsB.begin(), (int64_t)1);
  }

  // Output dims take the per-dimension maximum; dimensions are compatible
  // only when equal or when either side is 1 — otherwise inference fails.
  std::vector<int64_t> outDims;
  for (size_t i = 0; i < dimsA.size(); i++) {
    if ((dimsA[i] != dimsB[i]) && (dimsA[i] != 1) && (dimsB[i] != 1)) {
      return false;
    }
    outDims.push_back(dimsA[i] > dimsB[i] ? dimsA[i] : dimsB[i]);
  }

  outputDesc.SetShape(ge::Shape(outDims));
  outputDesc.SetDataType(inputDtype);
  outputDesc.SetFormat(inputFormat);
  op.UpdateOutputDesc(output_name, outputDesc);

  return true;
}

算子信息定义

Add算子的信息定义文件请参见“tbe/op_info_cfg/ai_core/<soc_version>/add.ini”。

分享:

    相关文档

    相关产品

close