# akg_module.py, from the sparsity_compiler repository:
# https://gitee.com/MondayYuan/sparsity_compiler (no LICENSE file declared)
import os
import time

import numpy as np

import akg
from akg.utils import kernel_exec as utils
from akg.utils.result_analysis import gpu_profiling
from akg.utils.format_transform import to_tvm_nd_array
from akg.ops.math_gpu.batch_matmul import batch_matmul

from split import Cube, Spliter, Merger, GPUProfiler
from my_utils import generate_mask, render_cubes_list
# from gen_random import random_gaussian

# Environment setup (machine-specific, kept for reference):
# os.system('. ~/MindSpore/akg/tests/test_env.sh gpu')
# os.system('export PYTHONPATH=/home/zyy/MindSpore/akg/tests/../third_party/incubator-tvm/python:/home/zyy/MindSpore/akg/tests/../third_party/incubator-tvm/topi:/home/zyy/MindSpore/akg/tests/../third_party/incubator-tvm/topi/python:/home/zyy/MindSpore/akg/tests/..:/home/zyy/MindSpore/akg/tests/../tests/common:/home/zyy/MindSpore/akg/tests/../python:/home/zyy/MindSpore/akg/tests/../tests/operators/gpu:/home/zyy/MindSpore/akg/tests/../tests/fuzz/tune_for_gpu:/home/zyy/tvm/python:')
# os.system('export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/home/zyy/MindSpore/akg/tests/../build:')
def translate_layout(data, positions):
    '''
    Slice a batched 3D matrix into a list of sub-blocks.
    data: 3D array of shape (B, Y, X)
    positions: list of (y0, y1, x0, x1) tuples
    '''
    if isinstance(data, np.ndarray):
        blocks = []
        for pos in positions:
            y0, y1, x0, x1 = pos
            blocks.append(data[:, y0:y1, x0:x1])
        return blocks
    elif isinstance(data, akg.tvm.ndarray.NDArray):
        # Slicing device-side NDArrays is not implemented yet; convert to numpy first.
        # (An earlier idea: data_reshape = akg.tvm.nd.reshape(data, (-1, M, N)).)
        raise NotImplementedError('translate_layout does not support akg.tvm.ndarray.NDArray')
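# A minimal, self-contained sketch of how translate_layout slices a batch into
# blocks. `_demo_translate_layout` is a hypothetical helper added for illustration
# only; it is not part of the original pipeline.
def _demo_translate_layout():
    data = np.arange(32, dtype=np.float32).reshape(2, 4, 4)  # (B=2, Y=4, X=4)
    # Extract the top-left and bottom-right quadrants of every batch element.
    blocks = translate_layout(data, [(0, 2, 0, 2), (2, 4, 2, 4)])
    assert blocks[0].shape == (2, 2, 2)
    assert np.array_equal(blocks[1][0], data[0, 2:4, 2:4])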
class Module(object):
    def __init__(self, B, M, N, K):
        super().__init__()
        # Batched matmul problem size: W (B, M, K) x I (B, N, K) -> O (B, M, N).
        self.B = B
        self.M = M
        self.N = N
        self.K = K
        self.profiler = {}  # timing statistics, in ms
    def compile(self, W, W_mask, I_mask, O_mask, src_block=(1, 1, 1), dst_block=(1, 1, 1), open_merge=True, autotuning=True, use_database=True):
        '''
        W: 2D (MxK) or 3D (BxMxK)
        W_mask: 2D (MxK)
        I_mask: 2D (KxN)
        O_mask: 2D (MxN)
        '''
        device_id = int(os.environ.get("DEVICE_ID", 0))
        self.ctx = akg.tvm.context('cuda', device_id)
        self.W_mask = W_mask
        self.I_mask = I_mask
        self.O_mask = O_mask
        self.src_block = src_block
        self.dst_block = dst_block
        # The masks are block-level; recover the element-level sizes from the block sizes.
        self.M = W_mask.shape[0] * src_block[0]
        self.N = I_mask.shape[1] * src_block[1]
        self.K = W_mask.shape[1] * src_block[2]
        # Step 1: split the masked matmul into dense cubes.
        st = time.time()
        print('splitting...')
        spliter = Spliter(W_mask, I_mask, O_mask, src_block, dst_block)
        self.cubes_list = spliter.split(autotuning=autotuning, max_num_cubes=100, render=True)
        ed = time.time()
        self.profiler['split time'] = (ed - st) * 1000
        self.profiler['spliter profiler'] = spliter.profiler
        self.profiler['number of cubes after splitting'] = len(self.cubes_list)
        # Step 2 (optional): greedily merge cubes to reduce the number of kernel launches.
        if open_merge:
            render_cubes_list(self.cubes_list, 'before.png', shape=(self.M, self.N))
            st = time.time()
            print('merging...')
            merger = Merger(self.cubes_list, self.B, use_database)
            self.cubes_list = merger.merge()
            ed = time.time()
            self.profiler['merge time'] = (ed - st) * 1000
            self.profiler['merger profiler'] = merger.profiler
            render_cubes_list(self.cubes_list, 'after.png', shape=(self.M, self.N))
        # Step 3: copy the weight blocks to the device and build one kernel per cube shape.
        weight_positions = [(cube.top, cube.bottom, cube.front, cube.back) for cube in self.cubes_list]
        weight_blocks = translate_layout(W, weight_positions)
        self.weights = [akg.tvm.nd.array(w, self.ctx) for w in weight_blocks]
        st = time.time()
        print('building akg ops...')
        self.build_akg_ops(self.cubes_list)
        ed = time.time()
        self.profiler['build akg ops time (after merging)'] = (ed - st) * 1000
        sparsity = self.sparse_ratio(W_mask, I_mask, O_mask)
        self.profiler['sparsity ratio'] = sparsity
    def build_akg_ops(self, cubes_list):
        self.mods_dict = {}
        counter = 0
        for cube in cubes_list:
            m, n, k = cube.bottom - cube.top, cube.right - cube.left, cube.back - cube.front
            # Cubes with the same (m, n, k) shape share one compiled kernel.
            if (m, n, k) in self.mods_dict:
                continue
            shape1 = (self.B, m, k)
            shape2 = (self.B, n, k)
            # The dtype matters: the kernels are specialized for float32.
            dtype = 'float32'
            kernel_name = "batch_matmul_{}_{}_{}_{}".format(self.B, m, n, k)
            default_attrs = {"target": "cuda"}
            mod = utils.op_build_test(batch_matmul, (shape1, shape2), (dtype, dtype), kernel_name=kernel_name, attrs=default_attrs)
            self.mods_dict[(m, n, k)] = mod
            counter += 1
        self.profiler['op_build_test counter'] = counter
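    # Note: op_build_test compiles a CUDA kernel for each distinct (m, n, k) shape,
    # which is expensive; the shape-keyed cache above means that, say, 100 cubes of
    # the same size cost only one compilation. This is also why the merger tries to
    # reduce the number of distinct cube shapes.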
    def merge_output(self, outputs, cubes_list):
        outputs_np = [x.asnumpy() for x in outputs]
        output = np.zeros((self.B, self.M, self.N), 'float32')
        # Accumulate each cube's partial result into its region of the full output
        # ('+=' lets cubes that split the K dimension contribute partial sums).
        for i in range(len(cubes_list)):
            left, right, top, bottom = cubes_list[i].left, cubes_list[i].right, cubes_list[i].top, cubes_list[i].bottom
            output[:, top:bottom, left:right] += outputs_np[i]
        # Expand the block-level output mask to element granularity, then zero masked entries.
        mask = np.repeat(self.O_mask, self.src_block[0], axis=0)
        mask = np.repeat(mask, self.src_block[1], axis=1)
        output[:, np.logical_not(mask)] = 0
        return output
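    # Worked example of the mask expansion above, assuming src_block = (2, 2, K):
    #     O_mask = [[1, 0],      np.repeat by 2 on both axes ->  [[1, 1, 0, 0],
    #               [0, 1]]                                       [1, 1, 0, 0],
    #                                                             [0, 0, 1, 1],
    #                                                             [0, 0, 1, 1]]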
    def run(self, I, profiling=False):
        '''
        I: 2D (NxK) or 3D (BxNxK)
        '''
        print(self.cubes_list)
        # Allocate one device-side output buffer per cube.
        self.outputs = [np.zeros((self.B, cube.bottom - cube.top, cube.right - cube.left), dtype='float32') for cube in self.cubes_list]
        self.outputs = [akg.tvm.nd.array(x, self.ctx) for x in self.outputs]
        # TODO: translate_layout of the input should be done in GPU global memory, e.g.:
        #     Id = akg.tvm.nd.array(I, self.ctx)
        #     self.inputs = translate_layout(Id, input_positions)
        input_positions = [(cube.left, cube.right, cube.front, cube.back) for cube in self.cubes_list]
        inputs_np = translate_layout(I, input_positions)
        self.inputs = [akg.tvm.nd.array(x, self.ctx) for x in inputs_np]
        # Launch one pre-built kernel per cube, selected by the cube's (m, n, k) shape.
        self.profiler['kernel computation'] = 0
        for i, cube in enumerate(self.cubes_list):
            m, n, k = cube.bottom - cube.top, cube.right - cube.left, cube.back - cube.front
            mod = self.mods_dict[(m, n, k)]
            mod_args = [self.weights[i], self.inputs[i], self.outputs[i]]
            mod(*mod_args)
            if profiling:
                self.profiler['kernel computation'] += GPUProfiler(shape=(self.B, m, n, k), ctx=self.ctx).profile()
                # Alternative: gpu_profiling(mod, *mod_args, repeat_time=200) * 1000
        # Gather the per-cube results back to the host and assemble the full output.
        start = time.time()
        self.output = self.merge_output(self.outputs, self.cubes_list)
        end = time.time()
        self.profiler['merge output'] = (end - start) * 1000
        return self.output
    def __call__(self, W, I, W_mask, I_mask, O_mask, profiling=False):
        self.compile(W, W_mask, I_mask, O_mask)
        return self.run(I, profiling)
    def test_non_split(self, W_np, I_np, profiling=False):
        # Baseline: run the whole (B, M, N, K) matmul as a single dense kernel.
        self.build_akg_ops([Cube(0, self.N, 0, self.M, 0, self.K)])
        I = akg.tvm.nd.array(I_np, self.ctx)
        W = akg.tvm.nd.array(W_np, self.ctx)
        O = akg.tvm.nd.array(np.zeros((self.B, self.M, self.N), dtype='float32'), self.ctx)
        mod = self.mods_dict[(self.M, self.N, self.K)]
        mod_args = [W, I, O]
        mod(*mod_args)
        if profiling:
            self.profiler['non split time'] = GPUProfiler(shape=(self.B, self.M, self.N, self.K), ctx=self.ctx).profile()
        output = O.asnumpy()
        mask = np.repeat(self.O_mask, self.src_block[0], axis=0)
        mask = np.repeat(mask, self.src_block[1], axis=1)
        output[:, np.logical_not(mask)] = 0
        # Speedup of the split-and-merged version over the dense baseline
        # (the small epsilon guards against division by zero).
        t_split = self.profiler['kernel computation']
        t_nonsplit = self.profiler['non split time']
        self.profiler['accelerate ratio'] = 1 - t_split / (t_nonsplit + 1e-10)
        return output
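    # Worked example for the accelerate ratio: if the split kernels take 2 ms in total
    # and the dense baseline takes 8 ms, the ratio is 1 - 2 / 8 = 0.75, i.e. 75% of the
    # baseline time is saved (illustrative numbers, not measured results).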
    def check_result_by_numpy(self, W_np, I_np, O_result):
        # Reference: O[b, i, j] = sum_k W[b, i, k] * I[b, j, k], i.e. a batched W @ I^T.
        O_np = np.einsum("bik, bjk -> bij", W_np, I_np)
        mask = np.repeat(self.O_mask, self.src_block[0], axis=0)
        mask = np.repeat(mask, self.src_block[1], axis=1)
        O_np[:, np.logical_not(mask)] = 0
        print('Testing....')
        np.testing.assert_allclose(O_np, O_result, rtol=1e-3)
        print('Test pass!')
    def sparse_ratio(self, W_mask, I_mask, O_mask):
        # Fraction of the dense work that can be skipped, given the density of each mask.
        r1 = np.sum(W_mask) / np.size(W_mask)
        r2 = np.sum(I_mask) / np.size(I_mask)
        r3 = np.sum(O_mask) / np.size(O_mask)
        return 1 - r1 * r2 * r3
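# Worked example for sparse_ratio: with W_mask and I_mask fully dense (r1 = r2 = 1)
# and half of O_mask set (r3 = 0.5), the ratio is 1 - 1 * 1 * 0.5 = 0.5,
# i.e. half of the dense computation is skippable.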
def test_stride_pattern(L=None):
    # Case 1: this should be representative of the most complex class of patterns,
    # and our method can still accelerate it. Also worth testing the performance
    # with open_merge disabled.
    # batch = 64
    # num_head = 8
    # seq_len = 512
    # hidden_size = 64
    # src_block = (2, 2, 8)
    # dst_block = (2, 2, 8)
    # O_mask = generate_mask(seq_len // src_block[0], num_local_atten=2, num_global_atten=1, stride=4, pattern='stride')
    # Case 2: same as above.
    # (a) The last few merge steps can be used as a demo.
    # (b) In this case, if bmm_database.pkl is deleted and the test is rerun, the
    #     result differs from run to run. Cause: profiled times are inaccurate and
    #     fluctuate, so the merge strategy makes different choices.
    batch = 1
    num_head = 8
    seq_len = 256
    hidden_size = 64
    if L is not None:
        seq_len = L
    src_block = (16, 16, hidden_size)
    dst_block = (16, 16, hidden_size)
    O_mask = generate_mask(seq_len // src_block[0], num_local_atten=4, num_global_atten=1, stride=4, pattern='stride')
    # print(O_mask.astype(int))
    W_np = np.random.uniform(size=(batch * num_head, seq_len, hidden_size)).astype(np.float32)
    I_np = np.random.uniform(size=(batch * num_head, seq_len, hidden_size)).astype(np.float32)
    W_mask = np.ones((seq_len // src_block[0], hidden_size // src_block[2]), dtype=bool)
    I_mask = np.ones((hidden_size // src_block[2], seq_len // src_block[0]), dtype=bool)
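    # Note: W_mask and I_mask are all-ones here, so all sparsity comes from the
    # attention-style O_mask; the split is driven entirely by which output blocks
    # O_mask allows.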
    batch_matmul_akg = Module(batch * num_head, seq_len, seq_len, hidden_size)
    batch_matmul_akg.compile(W_np, W_mask, I_mask, O_mask, src_block=src_block, dst_block=dst_block, open_merge=True, autotuning=False, use_database=False)
    O_akg = batch_matmul_akg.run(I_np, profiling=True)
    O_akg_nonsplit = batch_matmul_akg.test_non_split(W_np, I_np, profiling=True)
    batch_matmul_akg.check_result_by_numpy(W_np, I_np, O_akg)
    batch_matmul_akg.check_result_by_numpy(W_np, I_np, O_akg_nonsplit)
    print(batch_matmul_akg.profiler)
    # Collect key statistics for the (commented-out) sequence-length sweep in __main__.
    p = batch_matmul_akg.profiler
    info = [p['merger profiler']['number of cubes before merging'],
            p['merger profiler']['edge counter'],
            p['merger profiler']['build akg op counter'],
            p['merge time'],
            p['merger profiler']['build akg op time'],
            p['split time']]
    return info
    # Alternatively, compile and run in one call:
    # O_akg = batch_matmul_akg(W_np, I_np, W_mask, I_mask, O_mask)
def test_fixed_pattern():
    ####################################################
    # Case 1: in this case, open_merge harms performance.
    # Why? In principle, each merge step p1 = merge(p2) should satisfy
    # performance(p1) > performance(p2); even if that only reaches a local optimum,
    # it should not end up worse than the initial performance.
    # Guess: profiled times are inaccurate and fluctuate, so merging cubes with no
    # real benefit still shows a positive estimated gain.
    # The guess does not seem to hold, though.
    # batch = 1
    # num_head = 64
    # seq_len = 128
    # hidden_size = 64
    # src_block = dst_block = (4, 4, hidden_size)
    # O_mask = generate_mask(seq_len // src_block[0], num_local_atten=4, num_global_atten=1, pattern='fix')
    ####################################################
    # Case 2: same situation as the case above.
    batch = 1
    num_head = 4
    seq_len = 512
    hidden_size = 64
    src_block = (16, 16, hidden_size)
    dst_block = (16, 16, hidden_size)
    O_mask = generate_mask(seq_len // src_block[0], num_local_atten=4, num_global_atten=1, pattern='fix')
    # batch = 1
    # num_head = 4
    # seq_len = 256
    # hidden_size = 64
    # src_block = (4, 4, hidden_size)
    # dst_block = (4, 4, hidden_size)
    # O_mask = generate_mask(seq_len // src_block[0], num_local_atten=4, num_global_atten=1, pattern='fix')
    W_np = np.random.uniform(size=(batch * num_head, seq_len, hidden_size)).astype(np.float32)
    I_np = np.random.uniform(size=(batch * num_head, seq_len, hidden_size)).astype(np.float32)
    W_mask = np.ones((seq_len // src_block[0], hidden_size // src_block[2]), dtype=bool)
    I_mask = np.ones((hidden_size // src_block[2], seq_len // src_block[0]), dtype=bool)
    batch_matmul_akg = Module(batch * num_head, seq_len, seq_len, hidden_size)
    batch_matmul_akg.compile(W_np, W_mask, I_mask, O_mask, src_block=src_block, dst_block=dst_block, open_merge=True, use_database=False)
    O_akg = batch_matmul_akg.run(I_np, profiling=True)
    O_akg_nonsplit = batch_matmul_akg.test_non_split(W_np, I_np, profiling=True)
    # batch_matmul_akg.check_result_by_numpy(W_np, I_np, O_akg)
    # batch_matmul_akg.check_result_by_numpy(W_np, I_np, O_akg_nonsplit)
    print(batch_matmul_akg.profiler)
    # Alternatively, compile and run in one call:
    # O_akg = batch_matmul_akg(W_np, I_np, W_mask, I_mask, O_mask)
if __name__ == '__main__':
    # test_fixed_pattern()
    test_stride_pattern()
    # Sequence-length sweep:
    # info_list = []
    # for L in range(128, 2048, 128):
    #     print('------------------L={}--------------------'.format(L))
    #     info = test_stride_pattern(L)
    #     info_list.append(info)
    # print(info_list)