Quantizing a Semantic Segmentation Model with pytorch_quantization

This post uses the pytorch_quantization library to quantize the classic semantic segmentation model DeepLabV3+.

The quantization workflow is summarized in the figure below:

![[基于PytorchQuantization对语义分割模型量化-20250123185653]]

Assume a model has already been trained and saved as a .pth checkpoint, e.g. deeplabv3plus_base.pth; the quantization below starts from this checkpoint. The data-loading pipeline must also be kept around, because PTQ calibration needs to feed data through the model.
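The data pipeline itself is not shown in this post. The sketch below only illustrates the interface the later code expects: `train_dataloader` and `val_dataloader` yielding `(image, label)` batches. `RandomSegDataset` is a placeholder; in practice, reuse the Dataset class the base model was trained with.

```python
# Placeholder data pipeline (not from the original post): any DataLoader that
# yields (image, label) batches matching the training setup will do.
import torch
from torch.utils.data import DataLoader, Dataset

class RandomSegDataset(Dataset):
    """Placeholder dataset yielding (3x512x512 image, 512x512 mask) pairs."""
    def __init__(self, length=64):
        self.length = length

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        image = torch.randn(3, 512, 512)
        label = torch.randint(0, 2, (512, 512))
        return image, label

train_dataloader = DataLoader(RandomSegDataset(), batch_size=8, shuffle=True)
val_dataloader = DataLoader(RandomSegDataset(), batch_size=8, shuffle=False)
```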

First, define the measurement functions used to gauge inference latency and model accuracy:

```python
import time

import numpy as np
import torch
import torch.backends.cudnn as cudnn

cudnn.benchmark = True

# Helper function to benchmark the model
def benchmark(model, input_shape=(1024, 1, 32, 32), dtype='fp32', nwarmup=50, nruns=800):
    input_data = torch.randn(input_shape)
    input_data = input_data.to("cuda")
    if dtype == 'fp16':
        input_data = input_data.half()

    # Warm-up runs so cuDNN autotuning does not distort the timings
    with torch.no_grad():
        for _ in range(nwarmup):
            features = model(input_data)
        torch.cuda.synchronize()

    # Timed runs
    timings = []
    with torch.no_grad():
        for i in range(1, nruns + 1):
            start_time = time.time()
            output = model(input_data)
            torch.cuda.synchronize()
            end_time = time.time()
            timings.append(end_time - start_time)
    print('Average batch time: %.2f ms, median: %.2f ms' % (
        np.mean(timings) * 1000, np.median(timings) * 1000))
```
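The `evaluate` function and the loss criterion `crit` are referenced throughout the rest of the post but never shown. The following is only a minimal sketch of what they might look like for this 2-class segmentation task, assuming pixel accuracy is the reported metric:

```python
import torch.nn as nn

# Assumption: a plain cross-entropy criterion stands in for the project's actual loss
crit = nn.CrossEntropyLoss()

def evaluate(model, dataloader, crit):
    """Return (mean loss, pixel accuracy) over the given dataloader."""
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for data, labels in dataloader:
            data, labels = data.cuda(), labels.cuda()
            outputs = model(data)                    # (N, num_classes, H, W)
            total_loss += crit(outputs, labels.long()).item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.numel()
    return total_loss / len(dataloader), correct / total
```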

PTQ quantization

(1) Enable quantized modules and load the model; at this point quantizer nodes have already been inserted into the network.

```python
from pytorch_quantization import quant_modules

# Monkey-patch the standard layers (nn.Conv2d, nn.Linear, ...) so that every
# module constructed from now on uses its quantized counterpart
quant_modules.initialize()

# DeepLab is the project's own DeepLabV3+ implementation
q_model = DeepLab(in_channels=3, num_classes=2, pretrained=False)
q_model = q_model.cuda()

ckpt = torch.load("./models/deeplabv3plus_base.pth")
modified_state_dict = {}
for key, val in ckpt.items():
    # Remove 'module.' from the key names (checkpoint saved with DataParallel)
    if key.startswith('module'):
        modified_state_dict[key[7:]] = val
    else:
        modified_state_dict[key] = val

# Load the pre-trained checkpoint
q_model.load_state_dict(modified_state_dict)
```
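Because `quant_modules.initialize()` patches the standard layers, the freshly constructed DeepLab already contains `TensorQuantizer` nodes. A quick sanity check (not in the original post) to confirm they were inserted before calibrating:

```python
# Count the TensorQuantizer modules inserted by quant_modules.initialize()
from pytorch_quantization import nn as quant_nn

num_quantizers = sum(1 for _, m in q_model.named_modules()
                     if isinstance(m, quant_nn.TensorQuantizer))
print(f"TensorQuantizer modules in the model: {num_quantizers}")
```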

(2) Feed data through the network and calibrate the model

```python
import os

from tqdm import tqdm
from pytorch_quantization import calib
from pytorch_quantization import nn as quant_nn

def compute_amax(model, **kwargs):
    # Load calib result
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                if isinstance(module._calibrator, calib.MaxCalibrator):
                    module.load_calib_amax()
                else:
                    module.load_calib_amax(**kwargs)
            print(F"{name:40}: {module}")
    model.cuda()

def collect_stats(model, data_loader, num_batches):
    """Feed data to the network and collect statistics"""
    # Enable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()

    # Feed data to the network for collecting stats
    for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches):
        model(image.cuda())
        if i >= num_batches:
            break

    # Disable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()

def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator, hist_percentile, out_dir):
    if num_calib_batch > 0:
        print("Calibrating model")
        with torch.no_grad():
            collect_stats(model, data_loader, num_calib_batch)

        if not calibrator == "histogram":
            compute_amax(model, method="max")
            calib_output = os.path.join(
                out_dir,
                F"{model_name}-max-{num_calib_batch*data_loader.batch_size}.pth")
            torch.save(model.state_dict(), calib_output)
        else:
            for percentile in hist_percentile:
                print(F"{percentile} percentile calibration")
                compute_amax(model, method="percentile", percentile=percentile)
                calib_output = os.path.join(
                    out_dir,
                    F"{model_name}-percentile-{percentile}-{num_calib_batch*data_loader.batch_size}.pth")
                torch.save(model.state_dict(), calib_output)

            for method in ["mse", "entropy"]:
                print(F"{method} calibration")
                compute_amax(model, method=method)
                calib_output = os.path.join(
                    out_dir,
                    F"{model_name}-{method}-{num_calib_batch*data_loader.batch_size}.pth")
                torch.save(model.state_dict(), calib_output)

# Calibrate the model using the max calibration technique.
with torch.no_grad():
    calibrate_model(
        model=q_model,
        model_name="deeplabv3plus",
        data_loader=train_dataloader,
        num_calib_batch=32,
        calibrator="max",
        hist_percentile=[99.9, 99.99, 99.999, 99.9999],
        out_dir="models")

# Save the PTQ model
torch.save(q_model.state_dict(), "./models/deeplabv3plus_ptq.pth")
# Evaluate the PTQ-calibrated model
test_loss, test_acc = evaluate(q_model, val_dataloader, crit)
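The PyTorch-side latencies in the results table at the end were presumably measured with the `benchmark` helper defined at the top; a hedged example of how it might be invoked on the calibrated model (the 8x3x512x512 input shape is an assumption matching the ONNX export below):

```python
# Assumption: batch size 8 and 512x512 inputs, matching the ONNX export below
benchmark(q_model.eval(), input_shape=(8, 3, 512, 512), nwarmup=10, nruns=100)
```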

With the quantized model in hand, we try converting it into an engine with TensorRT:

```python
# Set static member of TensorQuantizer to use PyTorch's own fake quantization functions
quant_nn.TensorQuantizer.use_fb_fake_quant = True

q_model = q_model.eval()
# Exporting to ONNX
dummy_input = torch.randn(8, 3, 512, 512, device='cuda')
torch.onnx.export(
    q_model,
    dummy_input,
    "models/deeplabv3plus_ptq.onnx",
    verbose=False,
    opset_version=13,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
    do_constant_folding=False)

# ptq + fp32 + int8
!/home/wushaogui/soft/TensorRT-8.0.1.6/bin/trtexec --onnx=models/deeplabv3plus_ptq.onnx --int8 --saveEngine=models/deeplabv3plus_ptq_int8.trt
# ptq + fp32 + fp16 + int8
!/home/wushaogui/soft/TensorRT-8.0.1.6/bin/trtexec --onnx=models/deeplabv3plus_ptq.onnx --int8 --fp16 --saveEngine=models/deeplabv3plus_ptq_int8_fp16.trt
```

QAT quantization

QAT happens during training: starting from the PTQ-calibrated model, we feed the training data through the network again to fine-tune the quantization parameters.

```python
from torch.optim import lr_scheduler

def train(model, dataloader, crit, opt):
    model.train()
    for batch, (data, labels) in enumerate(dataloader):
        data, labels = data.cuda(), labels.cuda()
        opt.zero_grad()
        outputs = model(data)
        # Focal_Loss and Dice_loss are the project's own segmentation losses
        loss = Focal_Loss(outputs, labels) + Dice_loss(outputs, labels)
        loss.backward()
        opt.step()

optimizer = torch.optim.Adam(q_model.parameters(), lr=5e-4)
# optimizer = torch.optim.SGD(q_model.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)
scheduler = lr_scheduler.StepLR(optimizer, 2, 0.94)

q_model = q_model.train()
# Finetune the QAT model
num_epochs = 10
for epoch in range(num_epochs):
    print('Epoch: [%5d / %5d]' % (epoch + 1, num_epochs))
    train(q_model, train_dataloader, crit, optimizer)
    test_loss, acc = evaluate(q_model, val_dataloader, crit)
    scheduler.step()
    print("Test Loss: {:.5f} Test acc {:.2f}%".format(test_loss, acc * 100))

# Save the QAT model
torch.save(q_model.state_dict(), "./models/deeplabv3plus_qat.pth")
# Evaluate
test_loss, test_acc = evaluate(q_model, val_dataloader, crit)
```

In the same way, deploy the QAT model with TensorRT:

```python
# Set static member of TensorQuantizer to use PyTorch's own fake quantization functions
quant_nn.TensorQuantizer.use_fb_fake_quant = True

q_model = q_model.eval()
# Exporting to ONNX
dummy_input = torch.randn(8, 3, 512, 512, device='cuda')
torch.onnx.export(
    q_model,
    dummy_input,
    "models/deeplabv3plus_qat.onnx",
    verbose=False,
    opset_version=13,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}},
    do_constant_folding=False)

# qat + fp32 + int8
!/home/wushaogui/soft/TensorRT-8.0.1.6/bin/trtexec --onnx=models/deeplabv3plus_qat.onnx --int8 --saveEngine=models/deeplabv3plus_qat_int8.trt
# qat + fp32 + fp16 + int8
!/home/wushaogui/soft/TensorRT-8.0.1.6/bin/trtexec --onnx=models/deeplabv3plus_qat.onnx --int8 --fp16 --saveEngine=models/deeplabv3plus_qat_int8_fp16.trt
```

Comparing the results

Define the following TensorRT inference pipeline; it loads each engine, runs inference, and measures the inference time:

```python
import os
import time

import cv2
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context
from PIL import Image

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# For torchvision models, input images are loaded in to a range of [0, 1] and
# normalized using mean = [0.485, 0.456, 0.406] and stddev = [0.229, 0.224, 0.225].
def preprocess(image, data_dtype):
    image = image.convert('RGB')
    image = image.resize((512, 512), Image.BICUBIC)
    # preprocess_input is the project's normalization helper used during training
    image = np.transpose(preprocess_input(np.array(image, data_dtype)), [2, 0, 1])
    return image

def postprocess(data):
    num_classes = 2
    # create a color palette, selecting a color for each class
    palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 2 - 1])
    colors = np.array([palette * i % 255 for i in range(num_classes)]).astype("uint8")
    # plot the segmentation predictions in different colors
    img = Image.fromarray(data.astype('uint8'), mode='P')
    img.putpalette(colors)
    return img

def load_engine(engine_file_path):
    assert os.path.exists(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

def infer(engine, input_file, data_dtype, batchs):
    print("Reading input image from file {}".format(input_file))

    input_tensor = []
    for bt in range(batchs):
        with Image.open(input_file) as img:
            input_image = preprocess(img, data_dtype)
            image_width = img.width
            image_height = img.height
        input_tensor.append(input_image)
    input_tensor = np.array(input_tensor)

    with engine.create_execution_context() as context:
        # Set input shape based on image dimensions for inference
        # context.set_binding_shape(engine.get_binding_index("input"), (1, 3, image_height, image_width))
        context.set_binding_shape(engine.get_binding_index("input"), (batchs, 3, 512, 512))
        # Allocate host and device buffers
        bindings = []
        for binding in engine:
            binding_idx = engine.get_binding_index(binding)
            size = trt.volume(context.get_binding_shape(binding_idx))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            if engine.binding_is_input(binding):
                input_buffer = np.ascontiguousarray(input_tensor)
                input_memory = cuda.mem_alloc(input_tensor.nbytes)
                bindings.append(int(input_memory))
            else:
                output_buffer = cuda.pagelocked_empty(size, dtype)
                output_memory = cuda.mem_alloc(output_buffer.nbytes)
                bindings.append(int(output_memory))

        start_time = time.time()
        stream = cuda.Stream()
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(input_memory, input_buffer, stream)
        # Run inference
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        # Transfer prediction output from the GPU.
        cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
        # Synchronize the stream
        stream.synchronize()
        end_time = time.time()
        print('infer batch time %.3f ms' % ((end_time - start_time) * 1000))

    return output_buffer

def run(engine_file, input_file, data_dtype, batchs):
    if not os.path.exists(engine_file):
        return np.array(Image.open(input_file))

    with load_engine(engine_file) as engine:
        for i in range(5):  # run 5 times; keep the stabilized timing
            output_buffer = infer(engine, input_file, data_dtype, batchs)
        output = np.reshape(output_buffer, (batchs, 2, 512, 512))
        output = np.argmax(output, axis=1)

    return output
```
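The post does not show how `run()` is called for each engine. A hedged usage sketch follows; the test image path and batch size are assumptions, while the engine files are the ones built with trtexec above:

```python
import numpy as np

engine_files = [
    "models/deeplabv3plus_ptq_int8.trt",
    "models/deeplabv3plus_ptq_int8_fp16.trt",
    "models/deeplabv3plus_qat_int8.trt",
    "models/deeplabv3plus_qat_int8_fp16.trt",
]
for engine_file in engine_files:
    # Each call prints the per-batch inference time measured inside infer()
    pred = run(engine_file, "sample.png", np.float32, batchs=8)  # "sample.png" is a placeholder path
    print(engine_file, "prediction shape:", np.asarray(pred).shape)
```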

The figure below shows predictions from the original model and from the PTQ- and QAT-quantized models; the differences are barely noticeable.

![[Pasted image 20250123185133.png]]

The accuracy and latency of each variant are summarized in the table below:

| Stage | Accuracy (%) | Latency (ms) |
| --- | --- | --- |
| PyTorch (base) | 99.65 | 93.03 |
| pytorch_quantization (base_ptq) | 99.64 | 164.48 |
| pytorch_quantization (base_qat) | 99.64 | 114.94 |
| TensorRT (base_fp32) | - | 6.9 |
| TensorRT (base_fp16) | - | 3.3 |
| TensorRT (ptq_int8) | - | 3.3 |
| TensorRT (ptq_int8_fp16) | - | 3.3 |
| TensorRT (qat_int8) | - | 3.3 |
| TensorRT (qat_int8_fp16) | - | 3.3 |

Summary:

  1. Comparing the base model with its PTQ- or QAT-quantized counterparts in PyTorch, accuracy is essentially unchanged, but latency goes up because many fake-quantization ops are inserted into the model.
  2. After deploying with TensorRT, the quantized engines are indeed faster than the unquantized base_fp32 engine; however, TensorRT's own base_fp16 build is already excellent, so in this experiment doing PTQ/QAT outside TensorRT shows no clear advantage.