# model settings norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', pretrained='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth', # noqa backbone=dict( type='VisionTransformer', img_size=(512, 512), patch_size=16, in_channels=3, embed_dims=768, num_layers=12, num_heads=12, mlp_ratio=4, out_indices=(2, 5, 8, 11), qkv_bias=True, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0, with_cls_token=True, norm_cfg=dict(type='LN', eps=1e-6), act_cfg=dict(type='GELU'), norm_eval=False, out_shape='NCHW', interpolate_mode='bicubic'), neck=dict( type='MultiLevelNeck', in_channels=[768, 768, 768, 768], out_channels=768, scales=[4, 2, 1, 0.5]), decode_head=dict( type='UPerHead', in_channels=[768, 768, 768, 768], in_index=[0, 1, 2, 3], pool_scales=(1, 2, 3, 6), channels=512, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=768, in_index=3, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, num_classes=19, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) # yapf: disable