Quantizing a Semantic Segmentation Model with pytorch_quantization

This post uses the pytorch_quantization library to quantize the classic semantic segmentation model DeepLabV3+.

The quantization workflow is summarized in the figure below:

![[基于PytorchQuantization对语义分割模型量化-20250123185653]]

Assume a model has already been trained and saved as a .pth checkpoint, e.g. deeplabv3plus_base.pth; the quantization below starts from this checkpoint. The data-loading pipeline must also be kept around, because PTQ calibration needs to feed data through the model.
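The data pipeline itself is not shown in this post. The sketch below only illustrates the interface the later code expects: `train_dataloader` and `val_dataloader` yielding `(image, label)` batches. `RandomSegDataset` is a placeholder; in practice, reuse the Dataset class the base model was trained with.

```python
# Placeholder data pipeline (not from the original post): any DataLoader that
# yields (image, label) batches matching the training setup will do.
import torch
from torch.utils.data import DataLoader, Dataset

class RandomSegDataset(Dataset):
    """Placeholder dataset yielding (3x512x512 image, 512x512 mask) pairs."""
    def __init__(self, length=64):
        self.length = length

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        image = torch.randn(3, 512, 512)
        label = torch.randint(0, 2, (512, 512))
        return image, label

train_dataloader = DataLoader(RandomSegDataset(), batch_size=8, shuffle=True)
val_dataloader = DataLoader(RandomSegDataset(), batch_size=8, shuffle=False)
```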

First, define the measurement functions used to gauge inference latency and model accuracy:

```python
import time

import numpy as np
import torch
import torch.backends.cudnn as cudnn

cudnn.benchmark = True

# Helper function to benchmark the model
def benchmark(model, input_shape=(1024, 1, 32, 32), dtype='fp32', nwarmup=50, nruns=800):
    input_data = torch.randn(input_shape)
    input_data = input_data.to("cuda")
    if dtype == 'fp16':
        input_data = input_data.half()

    # Warm-up runs so cuDNN autotuning does not distort the timings
    with torch.no_grad():
        for _ in range(nwarmup):
            features = model(input_data)
        torch.cuda.synchronize()

    # Timed runs
    timings = []
    with torch.no_grad():
        for i in range(1, nruns + 1):
            start_time = time.time()
            output = model(input_data)
            torch.cuda.synchronize()
            end_time = time.time()
            timings.append(end_time - start_time)
    print('Average batch time: %.2f ms, median: %.2f ms' % (
        np.mean(timings) * 1000, np.median(timings) * 1000))
```
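The `evaluate` function and the loss criterion `crit` are referenced throughout the rest of the post but never shown. The following is only a minimal sketch of what they might look like for this 2-class segmentation task, assuming pixel accuracy is the reported metric:

```python
import torch.nn as nn

# Assumption: a plain cross-entropy criterion stands in for the project's actual loss
crit = nn.CrossEntropyLoss()

def evaluate(model, dataloader, crit):
    """Return (mean loss, pixel accuracy) over the given dataloader."""
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for data, labels in dataloader:
            data, labels = data.cuda(), labels.cuda()
            outputs = model(data)                    # (N, num_classes, H, W)
            total_loss += crit(outputs, labels.long()).item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.numel()
    return total_loss / len(dataloader), correct / total
```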

PTQ quantization

(1) Enable quantized modules and load the model; at this point quantizer nodes have already been inserted into the network.

```python
from pytorch_quantization import quant_modules

# Monkey-patch the standard layers (nn.Conv2d, nn.Linear, ...) so that every
# module constructed from now on uses its quantized counterpart
quant_modules.initialize()

# DeepLab is the project's own DeepLabV3+ implementation
q_model = DeepLab(in_channels=3, num_classes=2, pretrained=False)
q_model = q_model.cuda()

ckpt = torch.load("./models/deeplabv3plus_base.pth")
modified_state_dict = {}
for key, val in ckpt.items():
    # Remove 'module.' from the key names (checkpoint saved with DataParallel)
    if key.startswith('module'):
        modified_state_dict[key[7:]] = val
    else:
        modified_state_dict[key] = val

# Load the pre-trained checkpoint
q_model.load_state_dict(modified_state_dict)
```
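Because `quant_modules.initialize()` patches the standard layers, the freshly constructed DeepLab already contains `TensorQuantizer` nodes. A quick sanity check (not in the original post) to confirm they were inserted before calibrating:

```python
# Count the TensorQuantizer modules inserted by quant_modules.initialize()
from pytorch_quantization import nn as quant_nn

num_quantizers = sum(1 for _, m in q_model.named_modules()
                     if isinstance(m, quant_nn.TensorQuantizer))
print(f"TensorQuantizer modules in the model: {num_quantizers}")
```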

(2) Feed data through the network and calibrate the model

```python
import os

from tqdm import tqdm
from pytorch_quantization import calib
from pytorch_quantization import nn as quant_nn

def compute_amax(model, **kwargs):
    # Load calib result
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                if isinstance(module._calibrator, calib.MaxCalibrator):
                    module.load_calib_amax()
                else:
                    module.load_calib_amax(**kwargs)
            print(F"{name:40}: {module}")
    model.cuda()

def collect_stats(model, data_loader, num_batches):
    """Feed data to the network and collect statistics"""
    # Enable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()

    # Feed data to the network for collecting stats
    for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches):
        model(image.cuda())
        if i >= num_batches:
            break

    # Disable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()

def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator, hist_percentile, out_dir):
    if num_calib_batch > 0:
        print("Calibrating model")
        with torch.no_grad():
            collect_stats(model, data_loader, num_calib_batch)

        if not calibrator == "histogram":
            compute_amax(model, method="max")
            calib_output = os.path.join(
                out_dir,
                F"{model_name}-max-{num_calib_batch*data_loader.batch_size}.pth")
            torch.save(model.state_dict(), calib_output)
        else:
            for percentile in hist_percentile:
                print(F"{percentile} percentile calibration")
                compute_amax(model, method="percentile", percentile=percentile)
                calib_output = os.path.join(
                    out_dir,
                    F"{model_name}-percentile-{percentile}-{num_calib_batch*data_loader.batch_size}.pth")
                torch.save(model.state_dict(), calib_output)

            for method in ["mse", "entropy"]:
                print(F"{method} calibration")
                compute_amax(model, method=method)
                calib_output = os.path.join(
                    out_dir,
                    F"{model_name}-{method}-{num_calib_batch*data_loader.batch_size}.pth")
                torch.save(model.state_dict(), calib_output)

# Calibrate the model using the max calibration technique.
with torch.no_grad():
    calibrate_model(
        model=q_model,
        model_name="deeplabv3plus",
        data_loader=train_dataloader,
        num_calib_batch=32,
        calibrator="max",
        hist_percentile=[99.9, 99.99, 99.999, 99.9999],
        out_dir="models")

# Save the PTQ model
torch.save(q_model.state_dict(), "./models/deeplabv3plus_ptq.pth")
# Evaluate the PTQ-calibrated model
test_loss, test_acc = evaluate(q_model, val_dataloader, crit)
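The PyTorch-side latencies in the results table at the end were presumably measured with the `benchmark` helper defined at the top; a hedged example of how it might be invoked on the calibrated model (the 8x3x512x512 input shape is an assumption matching the ONNX export below):

```python
# Assumption: batch size 8 and 512x512 inputs, matching the ONNX export below
benchmark(q_model.eval(), input_shape=(8, 3, 512, 512), nwarmup=10, nruns=100)
```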

With the quantized model in hand, we try converting it into an engine with TensorRT:

```python
# Set static member of TensorQuantizer to use PyTorch's own fake quantization functions
quant_nn.TensorQuantizer.use_fb_fake_quant = True

q_model = q_model.eval()
# Exporting to ONNX
dummy_input = torch.randn(8, 3, 512, 512, device='cuda')
torch.onnx.export(
    q_model,
    dummy_input,
    "models/deeplabv3plus_ptq.onnx",
    verbose=False,
    opset_version=13,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
    do_constant_folding=False)

# ptq + fp32 + int8
!/home/wushaogui/soft/TensorRT-8.0.1.6/bin/trtexec --onnx=models/deeplabv3plus_ptq.onnx --int8 --saveEngine=models/deeplabv3plus_ptq_int8.trt
# ptq + fp32 + fp16 + int8
!/home/wushaogui/soft/TensorRT-8.0.1.6/bin/trtexec --onnx=models/deeplabv3plus_ptq.onnx --int8 --fp16 --saveEngine=models/deeplabv3plus_ptq_int8_fp16.trt
```

QAT quantization

QAT happens during training: starting from the PTQ-calibrated model, we feed the training data through the network again to fine-tune the quantization parameters.

```python
from torch.optim import lr_scheduler

def train(model, dataloader, crit, opt):
    model.train()
    for batch, (data, labels) in enumerate(dataloader):
        data, labels = data.cuda(), labels.cuda()
        opt.zero_grad()
        outputs = model(data)
        # Focal_Loss and Dice_loss are the project's own segmentation losses
        loss = Focal_Loss(outputs, labels) + Dice_loss(outputs, labels)
        loss.backward()
        opt.step()

optimizer = torch.optim.Adam(q_model.parameters(), lr=5e-4)
# optimizer = torch.optim.SGD(q_model.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)
scheduler = lr_scheduler.StepLR(optimizer, 2, 0.94)

q_model = q_model.train()
# Finetune the QAT model
num_epochs = 10
for epoch in range(num_epochs):
    print('Epoch: [%5d / %5d]' % (epoch + 1, num_epochs))
    train(q_model, train_dataloader, crit, optimizer)
    test_loss, acc = evaluate(q_model, val_dataloader, crit)
    scheduler.step()
    print("Test Loss: {:.5f} Test acc {:.2f}%".format(test_loss, acc * 100))

# Save the QAT model
torch.save(q_model.state_dict(), "./models/deeplabv3plus_qat.pth")
# Evaluate
test_loss, test_acc = evaluate(q_model, val_dataloader, crit)
```

In the same way, deploy the QAT model with TensorRT:

```python
# Set static member of TensorQuantizer to use PyTorch's own fake quantization functions
quant_nn.TensorQuantizer.use_fb_fake_quant = True

q_model = q_model.eval()
# Exporting to ONNX
dummy_input = torch.randn(8, 3, 512, 512, device='cuda')
torch.onnx.export(
    q_model,
    dummy_input,
    "models/deeplabv3plus_qat.onnx",
    verbose=False,
    opset_version=13,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
    do_constant_folding=False)

# qat + fp32 + int8
!/home/wushaogui/soft/TensorRT-8.0.1.6/bin/trtexec --onnx=models/deeplabv3plus_qat.onnx --int8 --saveEngine=models/deeplabv3plus_qat_int8.trt
# qat + fp32 + fp16 + int8
!/home/wushaogui/soft/TensorRT-8.0.1.6/bin/trtexec --onnx=models/deeplabv3plus_qat.onnx --int8 --fp16 --saveEngine=models/deeplabv3plus_qat_int8_fp16.trt
```

Comparing the results

Define the following TensorRT inference pipeline; it loads each engine, runs inference, and measures the inference time:

```python
import os
import time

import cv2
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context
from PIL import Image

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# For torchvision models, input images are loaded in to a range of [0, 1] and
# normalized using mean = [0.485, 0.456, 0.406] and stddev = [0.229, 0.224, 0.225].
def preprocess(image, data_dtype):
    image = image.convert('RGB')
    image = image.resize((512, 512), Image.BICUBIC)
    # preprocess_input is the project's normalization helper used during training
    image = np.transpose(preprocess_input(np.array(image, data_dtype)), [2, 0, 1])
    return image

def postprocess(data):
    num_classes = 2
    # create a color palette, selecting a color for each class
    palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 2 - 1])
    colors = np.array([palette * i % 255 for i in range(num_classes)]).astype("uint8")
    # plot the segmentation predictions in different colors
    img = Image.fromarray(data.astype('uint8'), mode='P')
    img.putpalette(colors)
    return img

def load_engine(engine_file_path):
    assert os.path.exists(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

def infer(engine, input_file, data_dtype, batchs):
    print("Reading input image from file {}".format(input_file))

    input_tensor = []
    for bt in range(batchs):
        with Image.open(input_file) as img:
            input_image = preprocess(img, data_dtype)
            image_width = img.width
            image_height = img.height
        input_tensor.append(input_image)
    input_tensor = np.array(input_tensor)

    with engine.create_execution_context() as context:
        # Set input shape based on image dimensions for inference
        # context.set_binding_shape(engine.get_binding_index("input"), (1, 3, image_height, image_width))
        context.set_binding_shape(engine.get_binding_index("input"), (batchs, 3, 512, 512))
        # Allocate host and device buffers
        bindings = []
        for binding in engine:
            binding_idx = engine.get_binding_index(binding)
            size = trt.volume(context.get_binding_shape(binding_idx))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            if engine.binding_is_input(binding):
                input_buffer = np.ascontiguousarray(input_tensor)
                input_memory = cuda.mem_alloc(input_tensor.nbytes)
                bindings.append(int(input_memory))
            else:
                output_buffer = cuda.pagelocked_empty(size, dtype)
                output_memory = cuda.mem_alloc(output_buffer.nbytes)
                bindings.append(int(output_memory))

        start_time = time.time()
        stream = cuda.Stream()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(input_memory, input_buffer, stream)
        # Run inference
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        # Transfer prediction output from the GPU.
        cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
        # Synchronize the stream
        stream.synchronize()
        end_time = time.time()
        print('infer batch time %.3f ms' % ((end_time - start_time) * 1000))

    return output_buffer

def run(engine_file, input_file, data_dtype, batchs):
    if not os.path.exists(engine_file):
        return np.array(Image.open(input_file))

    with load_engine(engine_file) as engine:
        for i in range(5):  # run 5 times; keep the stabilized timing
            output_buffer = infer(engine, input_file, data_dtype, batchs)
        output = np.reshape(output_buffer, (batchs, 2, 512, 512))
        output = np.argmax(output, axis=1)

    return output
```
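The post does not show how `run()` is called for each engine. A hedged usage sketch follows; the test image path and batch size are assumptions, while the engine files are the ones built with trtexec above:

```python
import numpy as np

engine_files = [
    "models/deeplabv3plus_ptq_int8.trt",
    "models/deeplabv3plus_ptq_int8_fp16.trt",
    "models/deeplabv3plus_qat_int8.trt",
    "models/deeplabv3plus_qat_int8_fp16.trt",
]
for engine_file in engine_files:
    # Each call prints the per-batch inference time measured inside infer()
    pred = run(engine_file, "sample.png", np.float32, batchs=8)  # "sample.png" is a placeholder path
    print(engine_file, "prediction shape:", np.asarray(pred).shape)
```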

The figure below shows predictions from the original model and from the PTQ- and QAT-quantized models; the differences are barely noticeable.

![[Pasted image 20250123185133.png]]

The accuracy and latency of each variant are summarized in the table below:

| Stage | Accuracy (%) | Latency (ms) |
| --- | --- | --- |
| PyTorch (base) | 99.65 | 93.03 |
| pytorch_quantization (base_ptq) | 99.64 | 164.48 |
| pytorch_quantization (base_qat) | 99.64 | 114.94 |
| TensorRT (base_fp32) | - | 6.9 |
| TensorRT (base_fp16) | - | 3.3 |
| TensorRT (ptq_int8) | - | 3.3 |
| TensorRT (ptq_int8_fp16) | - | 3.3 |
| TensorRT (qat_int8) | - | 3.3 |
| TensorRT (qat_int8_fp16) | - | 3.3 |

Summary:

  1. Comparing the base model with its PTQ- or QAT-quantized counterparts in PyTorch, accuracy is essentially unchanged, but latency goes up because many fake-quantization ops are inserted into the model.
  2. After deploying with TensorRT, the quantized engines are indeed faster than the unquantized base_fp32 engine; however, TensorRT's own base_fp16 build is already excellent, so in this experiment doing PTQ/QAT outside TensorRT shows no clear advantage.