
Commit c73708c

Fixed gelu, added layernorm; added TIM-VX versions of gelu and layernorm (#1415)

* Fixed gelu save_graph error: added SaveTmGeluOp()
* Added gelu for TIM-VX
* Added layernorm operator
* Added layernorm for TIM-VX

1 parent cb3b6e6 commit c73708c

File tree

15 files changed (+575, -1 lines)
(new file, +221 lines: CPU reference implementation of the layernorm operator)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (c) 2021, OPEN AI LAB
 * Author: Shijie Chen
 */

#include "layernorm_param.h"

#include "graph/tensor.h"
#include "graph/node.h"
#include "graph/graph.h"
#include "utility/sys_port.h"
#include "utility/float.h"
#include "utility/log.h"
#include "device/cpu/cpu_node.h"
#include "device/cpu/cpu_graph.h"
#include "device/cpu/cpu_module.h"

#include <math.h>

static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    return 0;
}

static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    return 0;
}

static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    return 0;
}

static int ref_layernorm_fp32(struct tensor* input_tensor, struct tensor* output_tensor,
                              struct tensor* gamma_tensor, struct tensor* beta_tensor, float eps)
{
#if 1
    // TIM-VX convention: normalize over the innermost (last) dimension only.
    int norm_size = input_tensor->dims[input_tensor->dim_num - 1];
    int count = 1;
    for (int i = 0; i < input_tensor->dim_num - 1; i++)
    {
        count *= input_tensor->dims[i];
    }
#else
    // PyTorch convention: the normalized span is the shape of gamma.
    int norm_size = gamma_tensor->elem_num;
    int count = input_tensor->elem_num / gamma_tensor->elem_num;
#endif

    const float* input_data = (const float*)input_tensor->data;
    float* output_data = (float*)output_tensor->data;

    const float* gamma_data = (const float*)gamma_tensor->data;
    const float* beta_data = (const float*)beta_tensor->data;

    for (int i = 0; i < count; i++)
    {
        // One pass accumulates the sum and sum of squares; var = E[x^2] - E[x]^2.
        float sum = 0.f;
        float sqsum = 0.f;
        for (int j = 0; j < norm_size; j++)
        {
            float x = input_data[i * norm_size + j];
            sum += x;
            sqsum += x * x;
        }
        float mean = sum / norm_size;
        float var = sqsum / norm_size - mean * mean;
        float a = 1.0f / sqrtf(var + eps);
        float b = -mean * a;
        for (int j = 0; j < norm_size; j++)
        {
            int offset = i * norm_size + j;
            output_data[offset] = (input_data[offset] * a + b) * gamma_data[j] + beta_data[j];
        }
    }

    return 0;
}

static int ref_layernorm_uint8(struct tensor* input_tensor, struct tensor* output_tensor,
                               struct tensor* gamma_tensor, struct tensor* beta_tensor, float eps)
{
#if 1
    // TIM-VX convention: normalize over the innermost (last) dimension only.
    int norm_size = input_tensor->dims[input_tensor->dim_num - 1];
    int count = 1;
    for (int i = 0; i < input_tensor->dim_num - 1; i++)
    {
        count *= input_tensor->dims[i];
    }
#else
    // PyTorch convention: the normalized span is the shape of gamma.
    int norm_size = gamma_tensor->elem_num;
    int count = input_tensor->elem_num / gamma_tensor->elem_num;
#endif

    int total_size = input_tensor->elem_num;
    float* input_data = (float*)sys_malloc(total_size * sizeof(float));
    float* output_data = (float*)sys_malloc(total_size * sizeof(float));

    // dequant
    {
        const uint8_t* input_uint8 = (const uint8_t*)input_tensor->data;
        float input_scale = input_tensor->scale;
        int input_zero = input_tensor->zero_point;

        for (int i = 0; i < total_size; i++)
            input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale;
    }

    const float* gamma_data = (const float*)gamma_tensor->data;
    const float* beta_data = (const float*)beta_tensor->data;

    for (int i = 0; i < count; i++)
    {
        float sum = 0.f;
        float sqsum = 0.f;
        for (int j = 0; j < norm_size; j++)
        {
            float x = input_data[i * norm_size + j];
            sum += x;
            sqsum += x * x;
        }
        float mean = sum / norm_size;
        float var = sqsum / norm_size - mean * mean;
        float a = 1.0f / sqrtf(var + eps);
        float b = -mean * a;
        for (int j = 0; j < norm_size; j++)
        {
            int offset = i * norm_size + j;
            output_data[offset] = (input_data[offset] * a + b) * gamma_data[j] + beta_data[j];
        }
    }

    // quant, clamping to the uint8 range
    {
        uint8_t* output_uint8 = (uint8_t*)output_tensor->data;
        float output_scale = output_tensor->scale;
        int output_zero = output_tensor->zero_point;
        for (int i = 0; i < total_size; i++)
        {
            int udata = (int)roundf(output_data[i] / output_scale + output_zero);
            if (udata > 255)
                udata = 255;
            else if (udata < 0)
                udata = 0;
            output_uint8[i] = udata;
        }
    }

    sys_free(input_data);
    sys_free(output_data);
    return 0;
}

static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    struct node* node = exec_node->ir_node;
    struct graph* graph = node->graph;

    struct tensor* input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]);
    struct tensor* gamma_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]);
    struct tensor* beta_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]);

    struct tensor* output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]);

    struct layernorm_Param* param = (struct layernorm_Param*)node->op.param_mem;
    float eps = param->eps;

    int ret = -1;
    if (input_tensor->data_type == TENGINE_DT_FP32)
        ret = ref_layernorm_fp32(input_tensor, output_tensor, gamma_tensor, beta_tensor, eps);
    else if (input_tensor->data_type == TENGINE_DT_UINT8)
        ret = ref_layernorm_uint8(input_tensor, output_tensor, gamma_tensor, beta_tensor, eps);

    return ret;
}

static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node)
{
    return OPS_SCORE_BEST;
}

static struct node_ops hcl_node_ops = {.prerun = NULL,
                                       .run = run,
                                       .reshape = NULL,
                                       .postrun = NULL,
                                       .init_node = init_node,
                                       .release_node = release_node,
                                       .score = score};

int register_layernorm_ref_op()
{
    return register_builtin_node_ops(OP_LAYERNORM, &hcl_node_ops);
}

int unregister_layernorm_ref_op()
{
    return unregister_builtin_node_ops(OP_LAYERNORM, &hcl_node_ops);
}
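For reference, both kernels above compute standard layer normalization over each contiguous row of norm_size elements, folding the normalization into a single multiply-add per element:

\[
y_j = \gamma_j \cdot \frac{x_j - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta_j,
\qquad
\mu = \frac{1}{N}\sum_{j=1}^{N} x_j,
\qquad
\sigma^2 = \frac{1}{N}\sum_{j=1}^{N} x_j^2 - \mu^2
\]

With \(a = 1/\sqrt{\sigma^2 + \epsilon}\) and \(b = -\mu a\), the inner loop reduces to \(y_j = (a x_j + b)\,\gamma_j + \beta_j\), exactly as written in the code. One caveat: the single-pass \(E[x^2] - E[x]^2\) variance can cancel catastrophically in float32 when the mean is large relative to the spread; a two-pass variance is the numerically safer (if slower) alternative.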

source/device/tim-vx/op/timvx_gelu.cc (new file, +47 lines)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (c) 2021, Open AI Lab
 * Author: Shijie Chen
 */

#include "timvx_executor.hpp"

extern "C"
{
#include "operator/op.h"
}

bool VXEngine::AddGeluNode(struct node* ir_node)
{
    struct graph* ir_graph = ir_node->graph;

    struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

    auto gelu = graph->CreateOperation<tim::vx::ops::Gelu>();
    (*gelu)
        .BindInputs({ this->vx_tensor_map[input_tensor->index] })
        .BindOutputs({ this->vx_tensor_map[output_tensor->index] });

    return true;
}
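For context, GELU is defined as gelu(x) = x * Phi(x), where Phi is the standard normal CDF. A minimal host-side sketch one might use to sanity-check the device output (a hypothetical helper named ref_gelu, not part of this commit):

#include <cmath>

// Exact GELU: gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))).
// Hypothetical cross-check helper; a TIM-VX Gelu node would be expected to
// match this curve up to its internal approximation and quantization error.
static float ref_gelu(float x)
{
    return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f)));
}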
(new file, +58 lines: TIM-VX layernorm node builder)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (c) 2021, Open AI Lab
 * Author: Shijie Chen
 */

#include "timvx_executor.hpp"

extern "C"
{
#include "operator/op.h"
#include "layernorm_param.h"
}

bool VXEngine::AddLayerNormNode(struct node* ir_node)
{
    struct graph* ir_graph = ir_node->graph;

    std::vector<std::shared_ptr<tim::vx::Tensor> > bn_in_tensor(ir_node->input_num);

    // Reorder Tengine's IR inputs (input, gamma, beta) into the order
    // the TIM-VX operation consumes.
    int in_set[3] = {0, 2, 1};
    for (int i = 0; i < ir_node->input_num; i++)
    {
        int idx = in_set[i];
        struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[idx]);
        bn_in_tensor[i] = this->vx_tensor_map[input_tensor->index];
    }
    struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

    struct layernorm_Param* param = (struct layernorm_Param*)ir_node->op.param_mem;

    // Axis 0 normalizes over the innermost dimension in TIM-VX's layout.
    auto layernorm = graph->CreateOperation<tim::vx::ops::LayerNormalization>(0, param->eps);
    (*layernorm)
        .BindInputs(bn_in_tensor)
        .BindOutputs({ this->vx_tensor_map[output_tensor->index] });

    return true;
}
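The in_set indirection above is compact but easy to misread: given Tengine's IR input order (input, gamma, beta), in_set = {0, 2, 1} rebinds the tensors as (input, beta, gamma) before handing them to TIM-VX. An equivalent, more explicit form of the same binding, sketched under the assumption that the node carries exactly those three inputs (the vx_input/vx_beta/vx_gamma names are illustrative only):

// Hypothetical unrolled replacement for the in_set loop in AddLayerNormNode:
auto vx_input = this->vx_tensor_map[get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0])->index];
auto vx_gamma = this->vx_tensor_map[get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1])->index];
auto vx_beta  = this->vx_tensor_map[get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2])->index];

(*layernorm)
    .BindInputs({ vx_input, vx_beta, vx_gamma })   // beta deliberately precedes gamma
    .BindOutputs({ this->vx_tensor_map[output_tensor->index] });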

source/device/tim-vx/timvx_executor.cc (+6)

@@ -365,6 +365,12 @@ int VXEngine::Build(struct subgraph* subgraph)
         case OP_L2NORMALIZATION:
             this->AddL2normalizationNode(ir_node);
             break;
+        case OP_GELU:
+            this->AddGeluNode(ir_node);
+            break;
+        case OP_LAYERNORM:
+            this->AddLayerNormNode(ir_node);
+            break;
         default:
             fprintf(stderr, "Tengine TIM-VX: Cannot support OP(%d).\n", ir_node->index);
             break;

source/device/tim-vx/timvx_executor.hpp (+3)

@@ -79,6 +79,7 @@ extern "C" {
 #include "tim/vx/ops/transpose.h"
 #include "tim/vx/ops/spatial_transformer.h"
 #include "tim/vx/ops/l2normalization.h"
+#include "tim/vx/ops/layernormalization.h"
 
 #define SPEC_TYPE_CONV 1
 #define SPEC_TYPE_CONV_BIAS 2
@@ -145,6 +146,8 @@ class VXEngine
     bool AddUpsampleNode(struct node* ir_node);
     bool AddSpatialtransformerNode(struct node* ir_node);
     bool AddL2normalizationNode(struct node* ir_node);
+    bool AddGeluNode(struct node* ir_node);
+    bool AddLayerNormNode(struct node* ir_node);
 
 public:
     std::shared_ptr<tim::vx::Context> context;

source/device/tim-vx/timvx_limit.hpp (+2)

@@ -131,5 +131,7 @@ const int timvx_supported_ops[] = {
     // OP_WHERE,
     // OP_SOFTPLUS,
     // OP_RECIPROCAL,
+    OP_GELU,
+    OP_LAYERNORM,
     // OP_BUILTIN_LAST
 };

source/operator/op.h (+1)

@@ -140,6 +140,7 @@ enum
     OP_SPATIALTRANSFORMER,
     OP_EXPAND,
     OP_GELU,
+    OP_LAYERNORM,
     OP_BUILTIN_LAST
 };

source/operator/op_name.h (+1)

@@ -127,3 +127,4 @@
 #define OP_SPATIALTRANSFORMER_NAME "SpatialTransformer"
 #define OP_EXPAND_NAME "Expand"
 #define OP_GELU_NAME "Gelu"
+#define OP_LAYERNORM_NAME "LayerNorm"
