
LayoutLMv3 training: problems and solutions

/rxhui/anaconda3/envs/layoutlmv3/lib/python3.7/site-packages/detectron2/modeling/roi_heads/cascade_rcnn.py

CascadeROIHeads

-1. Debugging (VS Code launch.json)

{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "python": "/rxhui/anaconda3/envs/layoutlmv3/bin/python3.7",
            "name": "LayoutLMV3",
            "type": "python",
            "request": "launch",
            "program": "/rxhui/lxj/workspace/project/unilm-master/layoutlmv3/examples/object_detection/train_net.py",
            "console": "integratedTerminal",
            "env": {
                "CUDA_VISIBLE_DEVICES": "0"
            },
            "justMyCode": false,
            "args": [
                "--config-file",
                "/rxhui/lxj/workspace/project/unilm-master/layoutlmv3/examples/object_detection/cascade_layoutlmv3.yaml",
                //"--eval-only",
                "--num-gpus",
                "4"
            ]
        }
    ]
}

0. Environment setup

Follow the configuration below exactly, in particular torch 1.10, and keep CUDA at 11.1 if possible.

unilm/layoutlmv3 at master · microsoft/unilm · GitHub

pip install setuptools==59.5.0
pip install protobuf==3.20.1 -i https://pypi.douban.com/simple/

[448, 448] -> (1, 3, 1088, 768)

[224, 224] -> (1, 3, 1088, 768)

(i.e. both nominal input sizes end up as the same (1, 3, 1088, 768) tensor after the detection preprocessing)

1. Dataset configuration

/data1/lxj/workspace/layout/unilm/layoutlmv3/examples/object_detection/train_net.py

In this file, modify the dataset registration to register the COCO-format dataset:

# comment out the other dataset registrations
from detectron2.data.datasets import register_coco_instances  # (import if not already present in train_net.py)

register_coco_instances("publaynet_train", {}, "/data1/lxj/workspace/layout/dataset/COCO/train/annotations.json", "/data1/lxj/workspace/layout/dataset/COCO/train")
register_coco_instances("publaynet_val", {}, "/data1/lxj/workspace/layout/dataset/COCO/val/annotations.json", "/data1/lxj/workspace/layout/dataset/COCO/val")
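As a quick sanity check (a minimal sketch, not part of the original notes), the registered splits can be inspected through detectron2's catalogs; "publaynet_train"/"publaynet_val" are simply the names registered above:

from detectron2.data import DatasetCatalog, MetadataCatalog

# Load the registered training split and print basic statistics.
dicts = DatasetCatalog.get("publaynet_train")   # list of per-image dicts
meta = MetadataCatalog.get("publaynet_train")   # class names etc., filled from the COCO json
print(len(dicts), "training images")
print("classes:", meta.thing_classes)           # should match NUM_CLASSES in the yaml (10 here)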



2. Training

/data1/lxj/workspace/layout/unilm/layoutlmv3/examples/object_detection/cascade_layoutlmv3.yaml

NUM_CLASSES: 10

IMS_PER_BATCH: 1

cd /data1/lxj/workspace/layout/unilm/layoutlmv3/examples/object_detection


python train_net.py --config-file cascade_layoutlmv3.yaml --num-gpus 4 MODEL.WEIGHTS /data1/lxj/workspace/layout/model/layoutlmv3-base-chinese/pytorch_model.bin OUTPUT_DIR /data1/lxj/workspace/layout/output/12-14

3. Batch size and training steps

Original paper:

16 GPUs (exact model not specified)

PubLayNet dataset: 335,703 images

batch size: 32

steps: 60,000, with 1,000 warmup steps

about 6 epochs (60,000 × 32 / 335,703 ≈ 5.7)

On the 420 machine with 2080 Ti GPUs:

CDLA_DATASET: 5,000 training images

maximum total batch size across 4 cards: 4

steps: 60,000, with 1,000 warmup steps
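A quick back-of-the-envelope check of the schedule (a sketch, not from the original notes; it assumes "steps" means optimizer updates and that the whole batch is seen per step):

# Rough schedule arithmetic for the two setups above.
paper_images, paper_batch, paper_steps = 335_703, 32, 60_000
print("paper epochs:", paper_steps * paper_batch / paper_images)   # ≈ 5.7

cdla_images, cdla_batch, cdla_steps = 5_000, 4, 60_000
print("CDLA epochs:", cdla_steps * cdla_batch / cdla_images)       # = 48.0
# To match the paper's ~6 epochs on CDLA, far fewer steps would suffice:
print("steps for ~6 epochs:", int(6 * cdla_images / cdla_batch))   # 7500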

4. Evaluation

Evaluation requires the COCO annotations JSON file.

python train_net.py --config-file cascade_layoutlmv3.yaml --eval-only --num-gpus 4 \
MODEL.WEIGHTS /data1/lxj/workspace/layout/output/model_final.pth \
OUTPUT_DIR /data1/lxj/workspace/layout/output/val
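To inspect the numbers afterwards, a small helper can read the metrics detectron2 dumps under OUTPUT_DIR (a sketch, assuming the default metrics.json written by the detectron2 trainer/evaluator; during --eval-only runs the AP table is also printed to the console):

import json

# metrics.json contains one JSON object per line; the last entry holds the final values.
metrics_path = "/data1/lxj/workspace/layout/output/val/metrics.json"  # assumed location
with open(metrics_path, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

last = records[-1]
for key, value in sorted(last.items()):
    if key.startswith("bbox/"):  # COCO box AP entries, e.g. bbox/AP, bbox/AP50, bbox/AP-Text
        print(f"{key}: {value:.3f}")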

5. Network issues (proxy settings)

export https_proxy=http://127.0.0.1:7890 http_proxy=http://127.0.0.1:7890 all_proxy=socks5://127.0.0.1:7890

6. Multimodal training method

1. Modify backbone.py

/data1/lxj/workspace/layout/unilm/layoutlmv3/examples/object_detection/ditod/backbone.py

# self.backbone = LayoutLMv3Model(config, detection=True,
#                                 out_features=out_features, image_only=image_only)
# change image_only to False so the text branch is kept
self.backbone = LayoutLMv3Model(config, detection=True,
                                out_features=out_features, image_only=False)

2. Modify rcnn_vl.py

/data1/lxj/workspace/layout/unilm/layoutlmv3/examples/object_detection/ditod/rcnn_vl.py

def allgen(examples, device):
    # pad all samples in the batch to the longest input_id length
    align_size = max([i['input_id'].shape[0] for i in examples])
    for i in range(len(examples)):
        if len(examples[i]['input_id']) < align_size:
            pad_size = align_size - len(examples[i]['input_id'])
            # pad input_id with 1 (the pad token id used elsewhere in these notes)
            examples[i]['input_id'] = torch.cat((examples[i]['input_id'], torch.ones(pad_size).to(device)))

        # pad box with [0, 0, 0, 0]
        if len(examples[i]['box']) < align_size:
            pad_size = align_size - len(examples[i]['box'])
            padding = torch.zeros(pad_size, 4)
            examples[i]['box'] = torch.cat((examples[i]['box'], padding.to(device)))
    return examples

# replace the original get_batch with this version
def get_batch(self, examples, images):

    if len(examples) >= 1 and "bbox" not in examples[0]:  # image_only branch of the original code
        # align every sample to the longest sequence, using 1 for input_id and [0,0,0,0] for box
        examples = allgen(examples, images.device)
        ids = torch.stack([i['input_id'].long() for i in examples], dim=0).to(images.device)

        box = torch.stack([i['box'].long() for i in examples], dim=0).to(images.device)

        return {"images": images.tensor, "input_ids": ids, "bbox": box}

    return input  # kept from the original implementation; not reached in this detection setup
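A minimal check of the padding behaviour (not from the original notes), using two samples of different length; allgen as defined above pads input_id with 1 and box with [0, 0, 0, 0]:

import torch

examples = [
    {"input_id": torch.tensor([5., 8., 9.]), "box": torch.zeros(3, 4)},
    {"input_id": torch.tensor([5.]),         "box": torch.zeros(1, 4)},
]
examples = allgen(examples, torch.device("cpu"))
print(examples[1]["input_id"])   # tensor([5., 1., 1.]) after padding
print(examples[1]["box"].shape)  # torch.Size([3, 4])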

3. Modify dataset_mapper.py

/data1/lxj/workspace/layout/unilm/layoutlmv3/examples/object_detection/ditod/dataset_mapper.py

import os
import json
from transformers import AutoTokenizer

def normalize_box(box, width, height):
    # box is in COCO [x, y, w, h] format; scale to the 0-1000 range LayoutLMv3 expects.
    # Clamped with min() at 1000 (the original snippet used max(), which pins every value to 1000
    # and would break the later assert that coordinates stay within 0-1023).
    return [
        min(int(1000 * (box[0] / width)), 1000),
        min(int(1000 * (box[1] / height)), 1000),
        min(int(1000 * ((box[0] + box[2]) / width)), 1000),
        min(int(1000 * ((box[1] + box[3]) / height)), 1000),
    ]
def read_OCRFile(image_path, reverseLen, removeStart=False):
    file_name, _ = os.path.splitext(os.path.basename(image_path))
    # build the OCR json path, skipping the "JPEGImages" folder level
    json_path = os.path.join(os.path.dirname(os.path.dirname(image_path)), "OCR", file_name + ".json")

    # read the OCR JSON file
    with open(json_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
    # one entry in words corresponds to one box
    box = [normalize_box(i, data['W'], data['H']) for i in data['bbox']]
    resbox = []
    input_idx = []
    # tokenizer is loaded on every call here; it could be cached for speed
    tokenizer = AutoTokenizer.from_pretrained("/rxhui/lxj/workspace/project/unilm-master/layoutlmv3/examples/layout-Chinese")
    for li in range(len(box)):
        tokens = tokenizer.tokenize(data['words'][li])
        token_ids = tokenizer.convert_tokens_to_ids(tokens)

        cur_CNT = min(len(tokens), reverseLen)

        for wod in range(cur_CNT):
            # if a token with id 6 appears at the start, drop it
            if removeStart and token_ids[wod] == 6 and cur_CNT > 1:
                continue
            resbox.append(box[li])
            input_idx.append(token_ids[wod])
    if len(input_idx) == 0:    # nothing kept: fall back to a single pad token
        return [1], [[0, 0, 0, 0]]
    elif len(input_idx) > 300:  # truncate if the token limit is exceeded
        input_idx = input_idx[:300]
        resbox = resbox[:300]
    return input_idx, resbox



input_id, box = read_OCRFile(dataset_dict["file_name"], 3, True)  # 3 = keep at most the first 3 tokens per word; True = drop a leading token id 6
dataset_dict["input_id"] = torch.tensor(input_id)  # token ids fed to the model
dataset_dict["box"] = torch.tensor(box)            # corresponding normalized boxes
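For reference, the OCR JSON that read_OCRFile expects is the one produced by the PaddleOCR script in section 5 below; a minimal hand-written example (illustrative values only):

import json

sample = {
    "bbox": [[120.0, 80.0, 240.0, 32.0]],  # [x, y, w, h] per text line, in pixels
    "words": ["示例文本"],                   # recognized text for each box
    "W": 1240,                              # image width in pixels
    "H": 1754,                              # image height in pixels
}
print(json.dumps(sample, ensure_ascii=False, indent=2))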

4. Chinese tokenizer

from transformers import AutoTokenizer

tokenizer=AutoTokenizer.from_pretrained("yuyijiong/layoutlmv3-base-chinese-xfund")
tokens=tokenizer.tokenize("你好,我叫李兴杰")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

#tokenizer.convert_tokens_to_string(tokens)
#tokenizer.encode(sentence)
#tokenizer.decode(token_ids,skip_special_tokens=False)
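One thing worth noting (not in the original notes): tokenize + convert_tokens_to_ids returns only content-token ids, while encode adds the special tokens by default, which matters when the ids are later concatenated with the detection inputs:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("yuyijiong/layoutlmv3-base-chinese-xfund")
text = "你好,我叫李兴杰"

ids_plain = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
ids_encoded = tokenizer.encode(text)  # adds special tokens (typically <s> ... </s>) by default
print(len(ids_plain), len(ids_encoded))  # the encoded version is usually two tokens longer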

5.OCR

import os
from PIL import Image
import json
from paddleocr import PaddleOCR, draw_ocr
ocr = PaddleOCR(use_angle_cls=True, lang="ch")
def perform_ocr(image_path):
    # run OCR on one image and return (bbox, words);
    # replace this with your own OCR call if needed
    print("process:" + image_path)
    result = ocr.ocr(image_path, cls=True)
    result = result[0]

    # convert each 4-point quad to [x, y, w, h] using its top-left and bottom-right corners
    bbox = [[line[0][0][0], line[0][0][1], line[0][2][0] - line[0][0][0], line[0][2][1] - line[0][0][1]] for line in result]

    words = [line[1][0] for line in result]  # recognized text per line
    return bbox, words

def process_images(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(('.jpg', '.jpeg', '.png')):  # only process image files
            image_path = os.path.join(folder_path, filename)

            # run OCR
            bbox, words = perform_ocr(image_path)

            # get the image resolution
            with Image.open(image_path) as img:
                w, h = img.size

            # build the result dict
            result = {
                "bbox": bbox,
                "words": words,
                "W": w,
                "H": h
            }

            # save it next to the image as a JSON file
            json_filename = os.path.splitext(filename)[0] + ".json"
            json_path = os.path.join(folder_path, json_filename)

            with open(json_path, 'w', encoding='utf-8') as json_file:
                json.dump(result, json_file, ensure_ascii=False, indent=2)

# replace with your own folder path
folder_path = r"D:\pythonproject\lmv3-test"
process_images(folder_path)
# bbox, words = perform_ocr("img2.jpg")

6. Converting the ViT output back into CNN-style feature maps

# assume a batch of 3 images of size (3, 1056, 736)
# ViT with patch size 16: after patch embedding the size is (3, 3036, 768), Hp = 1056 / 16 = 66, Wp = 736 / 16 = 46
# the image embedding (3, 3036, 768) is concatenated with the cls token (3, 1, 768) -> (3, 3037, 768)
# (3, 3037, 768) + position embedding (interpolated from (1, 197, 768)) = (3, 3037, 768)

# after concatenating the multimodal (text) embedding, e.g. (3, 201, 768), the hidden states in this example were (3, 3828, 768)

# hidden_states (3, 3828, 768), Hp = 66, Wp = 46, xp (3, 768, 66, 46)
xp = hidden_states[:, -Hp*Wp:, :].permute(0, 2, 1).reshape(len(hidden_states), -1, Hp, Wp)  # (3, 768, 66, 46)
# from xp the FPN branches extract the convolutional features feat_out, which are then passed on
feat_out[self.out_features[j]] = self.ops[j](xp.contiguous())
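A standalone toy version of that reshape (a sketch with made-up tensor sizes), showing how the last Hp*Wp tokens are folded back into a (B, C, Hp, Wp) feature map:

import torch

B, C, Hp, Wp, text_len = 3, 768, 66, 46, 792   # 792 stands in for the text + cls tokens here
hidden_states = torch.randn(B, text_len + Hp * Wp, C)

# keep only the visual tokens and reshape them into a 2D feature map
xp = hidden_states[:, -Hp * Wp:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp)
print(xp.shape)  # torch.Size([3, 768, 66, 46])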

7. Converting the image into patches

The image is split into patches, which are then processed by the transformer:

import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import to_2tuple

class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        # The following variables are used in detection mycheckpointer.py
        self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
        self.num_patches_w = self.patch_shape[0]
        self.num_patches_h = self.patch_shape[1]

    def forward(self, x, position_embedding=None):
        x = self.proj(x)

        if position_embedding is not None:
            # interpolate the position embedding to the corresponding size
            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3, 1, 2)
            Hp, Wp = x.shape[2], x.shape[3]
            position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic')
            x = x + position_embedding

        x = x.flatten(2).transpose(1, 2)
        return x
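A quick shape check (illustrative only): with the defaults, a 224×224 image becomes 196 patch tokens of dimension 768:

import torch

patch_embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
x = torch.randn(1, 3, 224, 224)
tokens = patch_embed(x)
print(tokens.shape)  # torch.Size([1, 196, 768]); (224/16) * (224/16) = 196 patches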

8. Adding the box/word embeddings to the high-level features

embedding_feat has shape (batch, boxsize, 768)

CNNfeat has shape (batch, 768, W, H)

# in /rxhui/lxj/workspace/project/unilm-master/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py
# fuse embedding_feat with CNNfeat (first rough draft; the working version follows below)
if self.detection:
    return encoder_outputs

def joinEmmbed(embedding_feat, CNNfeat, bbox):
    batch_size, n, _ = bbox.size()
    # initialize the target tensor x (presumably CNNfeat permuted to (batch, W, H, 768))
    process_box = (bbox / 100).long()  # map the 0-1000 coordinates onto a 10 x 10 grid
    for batch_idx in range(batch_size):
        for bbox_idx in range(n):
            # get the coordinates of the current bbox
            bx = process_box[batch_idx][bbox_idx].cpu().numpy()
            if not np.all(bx == 0):
                # note: in this draft the operands are swapped; the final version below adds
                # embedding_feat[batch_idx][bbox_idx] into the feature map instead
                x[batch_idx, bx[0]:bx[2] + 1, bx[1]:bx[3] + 1, :] += CNNfeat[batch_idx][bbox_idx]
            else:
                break

    x = x.permute(0, 3, 1, 2)




import torch
import numpy as np


bbox = torch.tensor([[[0,20,20,40],[100,200,200,1000],[0,0,0,0]],[[0,20,20,40],[100,200,200,400],[0,0,0,0]]])
batch_size, n, _ = bbox.size()
embedding_feat = torch.randn(batch_size,n,768)
CNNfeat = torch.randn(batch_size,768,23,29)

def joinEmmbed(embedding_feat, CNNfeat, bbox):
    batch_size, n, _ = bbox.size()
    bbox = bbox.float() / 1000
    # print(bbox)
    CW, CH = CNNfeat.shape[2], CNNfeat.shape[3]

    # scale the 0-1000 coordinates to the feature-map size
    bbox[:, :, 0] = bbox[:, :, 0] * CW
    bbox[:, :, 2] = bbox[:, :, 2] * CW
    bbox[:, :, 1] = bbox[:, :, 1] * CH
    bbox[:, :, 3] = bbox[:, :, 3] * CH
    bbox = bbox.long()
    CNNfeat = CNNfeat.permute(0, 2, 3, 1)
    for batch_idx in range(batch_size):
        for bbox_idx in range(n):
            # get the coordinates of the current bbox
            bx = bbox[batch_idx][bbox_idx].cpu().numpy()
            if not np.all(bx == 0):
                CNNfeat[batch_idx, bx[0]:bx[2] + 1, bx[1]:bx[3] + 1, :] += embedding_feat[batch_idx][bbox_idx]
            else:
                break
    CNNfeat = CNNfeat.permute(0, 3, 1, 2)
    return CNNfeat

joinEmmbed(embedding_feat, CNNfeat, bbox).shape

# the full modified /rxhui/lxj/workspace/project/unilm-master/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py follows

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch LayoutLMv3 model. """
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers import apply_chunking_to_forward
from transformers.modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
TokenClassifierOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from transformers.models.roberta.modeling_roberta import (
RobertaIntermediate,
RobertaLMHead,
RobertaOutput,
RobertaSelfOutput,
)
from transformers.utils import logging
import numpy as np
from .configuration_layoutlmv3 import LayoutLMv3Config
from timm.models.layers import to_2tuple


logger = logging.get_logger(__name__)


class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
# The following variables are used in detection mycheckpointer.py
self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
self.num_patches_w = self.patch_shape[0]
self.num_patches_h = self.patch_shape[1]

def forward(self, x, position_embedding=None):
x = self.proj(x)

if position_embedding is not None:
# interpolate the position embedding to the corresponding size
position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3, 1, 2)
Hp, Wp = x.shape[2], x.shape[3]
position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic')
x = x + position_embedding

x = x.flatten(2).transpose(1, 2)
return x

class LayoutLMv3Embeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""

# Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))

# End copy
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)

self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)

def _calc_spatial_position_embeddings(self, bbox):
try:
#print(torch.max(bbox),torch.min(bbox))
assert torch.all(0 <= bbox) and torch.all(bbox <= 1023)
left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
except IndexError as e:
raise IndexError("The :obj:`bbox` coordinate values should be within 0-1000 range.") from e

h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023))
w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023))

# below is the difference between LayoutLMEmbeddingsV2 (torch.cat) and LayoutLMEmbeddingsV1 (add)
spatial_position_embeddings = torch.cat(
[
left_position_embeddings,
upper_position_embeddings,
right_position_embeddings,
lower_position_embeddings,
h_position_embeddings,
w_position_embeddings,
],
dim=-1,
)
return spatial_position_embeddings

def create_position_ids_from_input_ids(self, input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
x: torch.Tensor x:

Returns: torch.Tensor
"""
# The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx

def forward(
self,
input_ids=None,
bbox=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None,
past_key_values_length=0,
):
if position_ids is None:
if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(
input_ids, self.padding_idx, past_key_values_length).to(input_ids.device)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]

if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = inputs_embeds + token_type_embeddings
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings

spatial_position_embeddings = self._calc_spatial_position_embeddings(bbox)

embeddings = embeddings + spatial_position_embeddings

embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings

def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
inputs_embeds: torch.Tensor

Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]

position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)


class LayoutLMv3PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""

config_class = LayoutLMv3Config
base_model_prefix = "layoutlmv3"

# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)


class LayoutLMv3SelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)

self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)

self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def cogview_attn(self, attention_scores, alpha=32):
'''
https://arxiv.org/pdf/2105.13290.pdf
Section 2.4 Stabilization of training: Precision Bottleneck Relaxation (PB-Relax).
A replacement of the original nn.Softmax(dim=-1)(attention_scores)
Seems the new attention_probs will result in a slower speed and a little bias
Can use torch.allclose(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison
The smaller atol (e.g., 1e-08), the better.
'''
scaled_attention_scores = attention_scores / alpha
max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1)
# max_value = scaled_attention_scores.amax(dim=(-2, -1)).unsqueeze(-1).unsqueeze(-1)
new_attention_scores = (scaled_attention_scores - max_value) * alpha
return nn.Softmax(dim=-1)(new_attention_scores)

def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
mixed_query_layer = self.query(hidden_states)

# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None

if is_cross_attention and past_key_value is not None:
# reuse k,v, cross_attentions
key_layer = past_key_value[0]
value_layer = past_key_value[1]
attention_mask = encoder_attention_mask
elif is_cross_attention:
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))

query_layer = self.transpose_for_scores(mixed_query_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
# The attention scores QT K/√d could be significantly larger than input elements, and result in overflow.
# Changing the computational order into QT(K/√d) alleviates the problem. (https://arxiv.org/pdf/2105.13290.pdf)
attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2))

if self.has_relative_attention_bias and self.has_spatial_attention_bias:
attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size)
elif self.has_relative_attention_bias:
attention_scores += rel_pos / math.sqrt(self.attention_head_size)

# if self.has_relative_attention_bias:
# attention_scores += rel_pos
# if self.has_spatial_attention_bias:
# attention_scores += rel_2d_pos

# attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
# attention_probs = nn.Softmax(dim=-1)(attention_scores) # comment the line below and use this line for speedup
attention_probs = self.cogview_attn(attention_scores) # to stablize training
# assert torch.allclose(attention_probs, nn.Softmax(dim=-1)(attention_scores), atol=1e-8)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask

context_layer = torch.matmul(attention_probs, value_layer)

context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)

outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

return outputs


class LayoutLMv3Attention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = LayoutLMv3SelfAttention(config)
self.output = RobertaSelfOutput(config)
self.pruned_heads = set()

def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)

# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)

def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs


class LayoutLMv3Layer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = LayoutLMv3Attention(config)
assert not config.is_decoder and not config.add_cross_attention, \
"This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder."
self.intermediate = RobertaIntermediate(config)
self.output = RobertaOutput(config)

def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
attention_output = self_attention_outputs[0]

outputs = self_attention_outputs[1:] # add self attentions if we output attention weights

layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs

return outputs

def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output


class LayoutLMv3Encoder(nn.Module):
def __init__(self, config, detection=False, out_features=None):
super().__init__()
self.config = config
self.detection = detection
self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False

self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias

if self.has_relative_attention_bias:
self.rel_pos_bins = config.rel_pos_bins
self.max_rel_pos = config.max_rel_pos
self.rel_pos_onehot_size = config.rel_pos_bins
self.rel_pos_bias = nn.Linear(self.rel_pos_onehot_size, config.num_attention_heads, bias=False)

if self.has_spatial_attention_bias:
self.max_rel_2d_pos = config.max_rel_2d_pos
self.rel_2d_pos_bins = config.rel_2d_pos_bins
self.rel_2d_pos_onehot_size = config.rel_2d_pos_bins
self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)
self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)

if self.detection:
self.gradient_checkpointing = True
embed_dim = self.config.hidden_size
self.out_features = out_features
self.out_indices = [int(name[5:]) for name in out_features]
self.fpn1 = nn.Sequential(
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
# nn.SyncBatchNorm(embed_dim),
nn.BatchNorm2d(embed_dim),
nn.GELU(),
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
)

self.fpn2 = nn.Sequential(
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
)

self.fpn3 = nn.Identity()

self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
self.ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]

def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128):
ret = 0
if bidirectional:
num_buckets //= 2
ret += (relative_position > 0).long() * num_buckets
n = torch.abs(relative_position)
else:
n = torch.max(-relative_position, torch.zeros_like(relative_position))
# now n is in the range [0, inf)

# half of the buckets are for exact increments in positions
max_exact = num_buckets // 2
is_small = n < max_exact

# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))

ret += torch.where(is_small, n, val_if_large)
return ret

def _cal_1d_pos_emb(self, hidden_states, position_ids, valid_span):
VISUAL_NUM = 196 + 1

rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)

if valid_span is not None:
# for the text part, if two words are not in the same line,
# set their distance to the max value (position_ids.shape[-1])
rel_pos_mat[(rel_pos_mat > 0) & (valid_span == False)] = position_ids.shape[1]
rel_pos_mat[(rel_pos_mat < 0) & (valid_span == False)] = -position_ids.shape[1]

# image-text, minimum distance
rel_pos_mat[:, -VISUAL_NUM:, :-VISUAL_NUM] = 0
rel_pos_mat[:, :-VISUAL_NUM, -VISUAL_NUM:] = 0

rel_pos = self.relative_position_bucket(
rel_pos_mat,
num_buckets=self.rel_pos_bins,
max_distance=self.max_rel_pos,
)
rel_pos = F.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).type_as(hidden_states)
rel_pos = self.rel_pos_bias(rel_pos).permute(0, 3, 1, 2)
rel_pos = rel_pos.contiguous()
return rel_pos

def _cal_2d_pos_emb(self, hidden_states, bbox):
position_coord_x = bbox[:, :, 0]
position_coord_y = bbox[:, :, 3]
rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1)
rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1)
rel_pos_x = self.relative_position_bucket(
rel_pos_x_2d_mat,
num_buckets=self.rel_2d_pos_bins,
max_distance=self.max_rel_2d_pos,
)
rel_pos_y = self.relative_position_bucket(
rel_pos_y_2d_mat,
num_buckets=self.rel_2d_pos_bins,
max_distance=self.max_rel_2d_pos,
)
rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
rel_pos_x = self.rel_pos_x_bias(rel_pos_x).permute(0, 3, 1, 2)
rel_pos_y = self.rel_pos_y_bias(rel_pos_y).permute(0, 3, 1, 2)
rel_pos_x = rel_pos_x.contiguous()
rel_pos_y = rel_pos_y.contiguous()
rel_2d_pos = rel_pos_x + rel_pos_y
return rel_2d_pos

def forward(
self,
hidden_states,
bbox=None,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
position_ids=None,
Hp=None,
Wp=None,
valid_span=None,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

next_decoder_cache = () if use_cache else None

rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids, valid_span) if self.has_relative_attention_bias else None
rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None

if self.detection:
feat_out = {}
j = 0

for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None

if self.gradient_checkpointing and self.training:

if use_cache:
logger.warning(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False

def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs)
# return module(*inputs, past_key_value, output_attentions, rel_pos, rel_2d_pos)
# The above line will cause error:
# RuntimeError: Trying to backward through the graph a second time
# (or directly access saved tensors after they have already been freed).
return custom_forward

layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(layer_module),
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
rel_pos,
rel_2d_pos
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)

hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
#modified - merge the text/box part of the output into the selected feature maps
boxlen = bbox.shape[1]
if self.detection and i in self.out_indices:
xp = hidden_states[:,boxlen:,:][:, -Hp*Wp:, :].permute(0, 2, 1).reshape(len(hidden_states), -1, Hp, Wp)
feat_out[self.out_features[j]] = self.ops[j](xp.contiguous())
if self.out_features[j] == "layer11":
feat_out[self.out_features[j]] = joinEmmbed(hidden_states[:,:boxlen,:],feat_out[self.out_features[j]],bbox)
j += 1

if self.detection:
return feat_out

if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)

def joinEmmbed(embedding_feat,CNNfeat,bbox):
batch_size, n, _ = bbox.size()
bbox = bbox.float() / 1000
#print(bbox)
CW ,CH = CNNfeat.shape[2],CNNfeat.shape[3]

bbox[:,:,0] = bbox[:,:,0] * CW
bbox[:,:,2] = bbox[:,:,2] * CW
bbox[:,:,1] = bbox[:,:,1] * CH
bbox[:,:,3] = bbox[:,:,3] * CH
bbox = bbox.long()
CNNfeat = CNNfeat.permute(0,2,3,1)
for batch_idx in range(batch_size):
for bbox_idx in range(n):
# get the coordinates of the current bbox
bx = bbox[batch_idx][bbox_idx].cpu().numpy()
if not np.all(bx == 0):
CNNfeat[batch_idx,bx[0]:bx[2] + 1,bx[1]:bx[3] + 1,:] += embedding_feat[batch_idx][bbox_idx]
else:
break
CNNfeat =CNNfeat.permute(0,3,1, 2)
return CNNfeat
class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
"""
"""

_keys_to_ignore_on_load_missing = [r"position_ids"]

# Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
def __init__(self, config, detection=False, out_features=None, image_only=False):
super().__init__(config)
self.config = config
assert not config.is_decoder and not config.add_cross_attention, \
"This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder."
self.detection = detection
if not self.detection:
self.image_only = False
else:
assert config.visual_embed
self.image_only = image_only

if not self.image_only:
self.embeddings = LayoutLMv3Embeddings(config)
self.encoder = LayoutLMv3Encoder(config, detection=detection, out_features=out_features)

if config.visual_embed:
embed_dim = self.config.hidden_size
# use the default pre-training parameters for fine-tuning (e.g., input_size)
# when the input_size is larger in fine-tuning, we will interpolate the position embedding in forward
self.patch_embed = PatchEmbed(embed_dim=embed_dim)

patch_size = 16
size = int(self.config.input_size / patch_size)
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, embed_dim))
self.pos_drop = nn.Dropout(p=0.)

self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
self._init_visual_bbox(img_size=(size, size))

from functools import partial
norm_layer = partial(nn.LayerNorm, eps=1e-6)
self.norm = norm_layer(embed_dim)

self.init_weights()

def get_input_embeddings(self):
return self.embeddings.word_embeddings

def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value

def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)

def _init_visual_bbox(self, img_size=(14, 14), max_len=1000):
visual_bbox_x = torch.div(torch.arange(0, max_len * (img_size[1] + 1), max_len),
img_size[1], rounding_mode='trunc')
visual_bbox_y = torch.div(torch.arange(0, max_len * (img_size[0] + 1), max_len),
img_size[0], rounding_mode='trunc')
visual_bbox = torch.stack(
[
visual_bbox_x[:-1].repeat(img_size[0], 1),
visual_bbox_y[:-1].repeat(img_size[1], 1).transpose(0, 1),
visual_bbox_x[1:].repeat(img_size[0], 1),
visual_bbox_y[1:].repeat(img_size[1], 1).transpose(0, 1),
],
dim=-1,
).view(-1, 4)

cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)

def _calc_visual_bbox(self, device, dtype, bsz): # , img_size=(14, 14), max_len=1000):
visual_bbox = self.visual_bbox.repeat(bsz, 1, 1)
visual_bbox = visual_bbox.to(device).type(dtype)
return visual_bbox

def forward_image(self, x):
if self.detection:
x = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None)
else:
x = self.patch_embed(x)
batch_size, seq_len, _ = x.size()

cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
if self.pos_embed is not None and self.detection:
cls_tokens = cls_tokens + self.pos_embed[:, :1, :]

x = torch.cat((cls_tokens, x), dim=1)
if self.pos_embed is not None and not self.detection:
x = x + self.pos_embed
x = self.pos_drop(x)

x = self.norm(x)
return x

# Copied from transformers.models.bert.modeling_bert.BertModel.forward
def forward(
self,
input_ids=None,
bbox=None,
attention_mask=None,
token_type_ids=None,
valid_span=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
images=None,
):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
use_cache (:obj:`bool`, `optional`):
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
decoding (see :obj:`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

use_cache = False

# if input_ids is not None and inputs_embeds is not None:
# raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
if input_ids is not None:
input_shape = input_ids.size()
batch_size, seq_length = input_shape
device = input_ids.device
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
batch_size, seq_length = input_shape
device = inputs_embeds.device
elif images is not None:
batch_size = len(images)
device = images.device
else:
raise ValueError("You have to specify either input_ids or inputs_embeds or images")

if not self.image_only:
# past_key_values_length
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

if attention_mask is None:
attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
# extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

encoder_extended_attention_mask = None

# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

if not self.image_only:
if bbox is None:
bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)

embedding_output = self.embeddings(
input_ids=input_ids,
bbox=bbox,
position_ids=position_ids,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
past_key_values_length=past_key_values_length,
)

final_bbox = final_position_ids = None
Hp = Wp = None
if images is not None:
patch_size = 16
Hp, Wp = int(images.shape[2] / patch_size), int(images.shape[3] / patch_size)
visual_emb = self.forward_image(images)
if self.detection:
visual_attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)
if self.image_only:
attention_mask = visual_attention_mask
else:
#modified: keep the text attention mask and concatenate the visual mask
#attention_mask = visual_attention_mask
attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1)
elif self.image_only:
attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)

if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
if self.config.has_spatial_attention_bias:
visual_bbox = self._calc_visual_bbox(device, dtype=torch.long, bsz=batch_size)
if self.image_only:
final_bbox = visual_bbox
else:
#modified: concatenate the text bbox with the visual bbox
#final_bbox = visual_bbox
final_bbox = torch.cat([bbox, visual_bbox], dim=1)

visual_position_ids = torch.arange(0, visual_emb.shape[1], dtype=torch.long, device=device).repeat(
batch_size, 1)
if self.image_only:
final_position_ids = visual_position_ids
else:
#modified: build text position ids and concatenate them with the visual position ids
#final_position_ids = visual_position_ids
position_ids = torch.arange(0, input_shape[1], device=device).unsqueeze(0)
position_ids = position_ids.expand_as(input_ids)
final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)

if self.image_only:
embedding_output = visual_emb
else:
#embedding_output = self.LayerNorm(embedding_output)
#embedding_output = self.dropout(embedding_output)
#embedding_feat = embedding_output.clone()
#the embedding here needs to be changed
###########################mycode#################################
#embedding_output = visual_emb
###########################mycode#################################
embedding_output = torch.cat([embedding_output, visual_emb], dim=1)
embedding_output = self.LayerNorm(embedding_output)
embedding_output = self.dropout(embedding_output)
elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
if self.config.has_spatial_attention_bias:
final_bbox = bbox
if self.config.has_relative_attention_bias:
position_ids = self.embeddings.position_ids[:, :input_shape[1]]
position_ids = position_ids.expand_as(input_ids)
final_position_ids = position_ids

extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, None, device)

encoder_outputs = self.encoder(
embedding_output,
bbox=bbox,
position_ids=final_position_ids,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
Hp=Hp,
Wp=Wp,
valid_span=valid_span,
)

if self.detection:
#modified (older whole-map fusion attempts kept commented out below)
#encoder_outputs['layer3'] = joinEmmbed(embedding_feat,encoder_outputs['layer3'],bbox)
#encoder_outputs['layer5'] = joinEmmbed(embedding_feat,encoder_outputs['layer5'],bbox)
#encoder_outputs['layer7'] = joinEmmbed(embedding_feat,encoder_outputs['layer7'],bbox)
#encoder_outputs['layer11'] = joinEmmbed(embedding_feat,encoder_outputs['layer11'],bbox)
return encoder_outputs

sequence_output = encoder_outputs[0]
pooled_output = None

if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]

return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
past_key_values=encoder_outputs.past_key_values,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
cross_attentions=encoder_outputs.cross_attentions,
)


class LayoutLMv3ClassificationHead(nn.Module):
"""
Head for sentence-level classification tasks.
Reference: RobertaClassificationHead
"""

def __init__(self, config, pool_feature=False):
super().__init__()
self.pool_feature = pool_feature
if pool_feature:
self.dense = nn.Linear(config.hidden_size*3, config.hidden_size)
else:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, x):
# x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x


class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

self.layoutlmv3 = LayoutLMv3Model(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if config.num_labels < 10:
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
else:
self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)

self.init_weights()

def forward(
self,
input_ids=None,
bbox=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
valid_span=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
images=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
1]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

outputs = self.layoutlmv3(
input_ids,
bbox=bbox,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
images=images,
valid_span=valid_span,
)

sequence_output = outputs[0]

sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)

loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output

return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)


class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels

self.layoutlmv3 = LayoutLMv3Model(config)
# self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False)

self.init_weights()

def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
valid_span=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
bbox=None,
images=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

outputs = self.layoutlmv3(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
bbox=bbox,
images=images,
valid_span=valid_span,
)

sequence_output = outputs[0]

logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()

total_loss = None
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2

if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output

return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)


class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]

def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.layoutlmv3 = LayoutLMv3Model(config)
self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)

self.init_weights()

def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
valid_span=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
bbox=None,
images=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

outputs = self.layoutlmv3(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
bbox=bbox,
images=images,
valid_span=valid_span,
)

sequence_output = outputs[0][:, 0, :]
logits = self.classifier(sequence_output)

loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"

if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)

if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output

return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

9. Model architecture

# the input image (3, W, H) goes through patch embedding -> (W/16, H/16, 768) -> (w * h, 768)
# visualEmbedding (1 + w * h, 768) = position embedding + patchEmbedding (w * h, 768), with the cls token prepended
# the text + box encoding is concatenated and fed into the transformer: [textWithBoxEmbedding (wordsizeN), visualEmbedding (1 + w * h, 768)]
# the outputs [textWithBoxEmbedding, visualEmbedding] are taken from layers 3, 5, 7, 11
# the visualEmbedding part is reshaped back to (w, h, 768) using w * h
# following the bboxes, the textWithBoxEmbedding features are added into the corresponding regions of the visual feature map
# the resulting feature maps then go through the FPN (5 levels) -> Cascade R-CNN




# A modality-fusion module could be added here, i.e. something that controls the relative weight of the visual and text parts when they are summed
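
A minimal shape check of the patch-embedding step described above (a sketch that assumes a 16x16 convolutional patch embedding, which is how ViT-style backbones usually implement it; not the repo's exact code):

import torch
import torch.nn as nn

# 16x16 patch embedding: (B, 3, H, W) -> (B, 768, H/16, W/16) -> (B, w*h, 768)
patch_embed = nn.Conv2d(3, 768, kernel_size=16, stride=16)

img = torch.randn(1, 3, 448, 448)                      # image input (3, W, H)
feat = patch_embed(img)                                # (1, 768, 28, 28)
tokens = feat.flatten(2).transpose(1, 2)               # (1, 28*28, 768) = (1, 784, 768)

cls_token = torch.zeros(1, 1, 768)                     # the extra "1 +" token
visual_embed = torch.cat([cls_token, tokens], dim=1)   # (1, 1 + w*h, 768)
print(visual_embed.shape)                              # torch.Size([1, 785, 768])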

10. Results analysis

CDLA dataset

| Model | Text | Title | Figure | Figure caption | Table | Table caption | Header | Footer | Reference | Equation | mAP |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| yolov5x6u | 95.6 | 92.9 | 91.1 | 91.1 | 98.4 | 94.2 | 93.6 | 75.3 | 95.7 | 91.0 | 91.9 |
| layoutLMv3 | 93.3 | 91.3 | 83.6 | 88.7 | 97.6 | 92.9 | 91.0 | 82.7 | 93.2 | 79.1 | 89.3 |
| layoutLMv3 + two-column handling | 92.5 | 91.6 | 82.3 | 87.8 | 97.8 | 92.7 | 94.1 | 83.7 | 94.4 | 78.1 | 89.5 |
| layoutLMv3 + single/double-column distinction | 94.3 | 91.8 | 84.2 | 89.6 | 97.0 | 93.0 | 94.0 | 83.8 | 94.3 | 77.9 | 90.0 |
| layoutLMv3 + single/double-column distinction + all text | 92.3 | 91.7 | 83.3 | 88.9 | 97.3 | 93.2 | 94.6 | 83.8 | 94.8 | 78.2 | 89.8 |
| layoutLMv3 + single/double-column distinction + unknown tokens removed | 93.6 | 91.6 | 83.4 | 89.0 | 96.9 | 92.7 | 94.1 | 83.5 | 94.4 | 77.6 | 89.7 |
| layoutLMv3 + single/double-column distinction + partial text-feature fusion (VSR style) | 94.3 | 92.3 | 84.7 | 90.5 | 97.1 | 93.0 | 94.3 | 83.7 | 94.8 | 78.5 | 90.3 |
| layoutLMv3 + single/double-column distinction + partial text-feature fusion (CNN weight-matrix fusion) | 92.8 | 92.0 | 84.8 | 89.5 | 97.6 | 93.1 | 94.6 | 83.9 | 95.5 | 81.5 | 90.5 |

CDLA dataset

| Model | Text | Title | Figure | Figure caption | Table | Table caption | Header | Footer | Reference | Equation | mAP |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| yolov5x6u | 95.6 | 92.9 | 91.1 | 91.1 | 98.4 | 94.2 | 93.6 | 75.3 | 95.7 | 91.0 | 91.9 |
| layoutLMv3 | 93.3 | 91.3 | 83.6 | 88.7 | 97.6 | 92.9 | 91.0 | 82.7 | 93.2 | 79.1 | 89.3 |
| layoutLMv3+WE-All | 92.3 | 91.7 | 83.3 | 88.9 | 97.3 | 93.2 | 94.6 | 83.8 | 94.8 | 78.2 | 89.8 |
| layoutLMv3+WE-F3 | 94.3 | 91.8 | 84.2 | 89.6 | 97.0 | 93.0 | 94.0 | 83.8 | 94.3 | 77.9 | 90.0 |
| layoutLMv3+WE-F3+AF1 | 94.3 | 92.3 | 84.7 | 90.5 | 97.1 | 93.0 | 94.3 | 83.7 | 94.8 | 78.5 | 90.3 |
| layoutLMv3+WE-F3+AF2 | 92.8 | 92.0 | 84.8 | 89.5 | 97.6 | 93.1 | 94.6 | 83.9 | 95.5 | 81.5 | 90.5 |
| layoutLMv3+SE+AF2 | 93.6 | 91.6 | 83.4 | 89.0 | 97.0 | 92.6 | 94.1 | 83.4 | 94.3 | 77.6 | 89.6 |
| layoutLMv3+WE-F3+AF2+RM | 92.1 | 91.7 | 83.6 | 88.3 | 97.2 | 93.0 | 94.5 | 83.3 | 95.3 | 79.5 | 89.9 |
| layoutLMv3+WE-F3+AF2+RM (FPN-768d + Attention) | 92.7 | 91.4 | 83.4 | 88.6 | 96.7 | 93.1 | 93.6 | 82.9 | 94.7 | 79.8 | 89.7 |
| layoutLMv3+WE-F3+AF2+WIOU-loss | 92.5 | 91.3 | 82.5 | 85.8 | 98.0 | 93.8 | 92.3 | 82.4 | 93.7 | 78.9 | 89.1 |
| layoutLMv3+WE-All+AF2 | 92.1 | 91.9 | 84.2 | 89.1 | 97.2 | 93.2 | 94.7 | 83.7 | 95.2 | 81.0 | 90.2 |
| layoutLMv3+WE-F3+AF2+SoftMax | 92.7 | 92.0 | 84.9 | 89.5 | 97.8 | 93.7 | 94.6 | 83.9 | 95.5 | 81.7 | 90.6 |
| layoutLMv3+WE-All+AF1 (added before FPN, topmost level only)+SoftMax | 92.4 | 91.9 | 83.2 | 88.4 | 97.1 | 93.6 | 94.1 | 83.2 | 95.2 | 79.6 | 89.9 |
| layoutLMv3+WE-All+AF1 (added after FPN)+SoftMax | 92.6 | 92.0 | 83.1 | 88.6 | 97.1 | 93.6 | 94.2 | 83.2 | 95.3 | 79.7 | 90.0 |
| layoutLMv3+WE-All+AF1 (added after FPN) + (AVG/MAX-pooling feature fusion) | 93.0 | 92.3 | 84.3 | 89.1 | 97.2 | 92.3 | 93.1 | 83.1 | 94.8 | 79.3 | 89.9 |
| layoutLMv3+WE-All+AF1 (added after the Transformer, before FPN) | 93.4 | 91.4 | 86.1 | 90.3 | 98.2 | 92.5 | 90.6 | 81.4 | 92.0 | 78.4 | 89.5 |

Relation-module post-processing: the output features are extracted over the regions of interest (ROI), bounding-box position embeddings are added, and the result is fed into an attention layer, which outputs the corresponding xywh, score, and class score.
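
A rough sketch of that post-processing idea (an illustrative implementation, not the exact module used in these experiments; the ROI resolution, embedding width and attention layout are assumptions):

import torch
import torch.nn as nn
from torchvision.ops import roi_align

class RelationRefiner(nn.Module):
    """Refine detections: ROI features + bbox position embedding -> attention -> box/score/class."""
    def __init__(self, in_channels=256, d_model=256, num_classes=10):
        super().__init__()
        self.pos_embed = nn.Linear(4, d_model)            # bbox (x, y, w, h) -> embedding
        self.proj = nn.Linear(in_channels * 7 * 7, d_model)
        self.attn = nn.MultiheadAttention(d_model, num_heads=8, batch_first=True)
        self.box_head = nn.Linear(d_model, 4)             # xywh
        self.score_head = nn.Linear(d_model, 1)           # objectness score
        self.cls_head = nn.Linear(d_model, num_classes)   # class scores

    def forward(self, feature_map, boxes):
        # feature_map: (1, C, H, W); boxes: (N, 4) in feature-map coordinates
        rois = roi_align(feature_map, [boxes], output_size=(7, 7))   # (N, C, 7, 7)
        x = self.proj(rois.flatten(1)) + self.pos_embed(boxes)       # (N, d_model)
        x, _ = self.attn(x[None], x[None], x[None])                  # model relations between boxes
        x = x[0]
        return self.box_head(x), self.score_head(x), self.cls_head(x)

refiner = RelationRefiner()
boxes_out, scores_out, cls_out = refiner(torch.randn(1, 256, 64, 64),
                                         torch.tensor([[4., 4., 20., 30.], [10., 2., 40., 12.]]))
print(boxes_out.shape, scores_out.shape, cls_out.shape)  # (2, 4) (2, 1) (2, 10)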

New dataset

| Model | Text (Precision) | Table (Precision) | Figure (Precision) | Header (Precision) | Footer (Precision) | mAP50-95 |
| --- | --- | --- | --- | --- | --- | --- |
| yolov5x6u | 87.8 | 98.3 | 81.7 | 86.7 | 72.8 | 85.5 |
| layoutLMv3 | 80.5 | 95.6 | 67.8 | 82.6 | 65.0 | 78.3 |
| layoutLMv3+WE-F3+AF2 | 82.2 | 95.4 | 73.5 | 80.6 | 68.0 | 80.0 |
| layoutLMv3+WE-F3+AF2+SoftMax | 81.7 | 96.4 | 74.6 | 83.2 | 69.3 | 81.0 |

PubLayNet dataset (English)

| Model | Text | Title | List | Table | Figure | mAP50-95 |
| --- | --- | --- | --- | --- | --- | --- |
| yolov5x6u | 95.7 | 91.2 | 96.5 | 98.3 | 98.3 | 96.0 |
| VSR | 96.7 | 93.1 | 94.7 | 97.4 | 96.4 | 95.7 |
| layoutLMv3 | 94.5 | 90.6 | 95.5 | 97.9 | 97.0 | 95.1 |
| layoutLMv3+WE-F3+AF2 | 97.1 | 91.0 | 94.7 | 97.7 | 97.0 | 95.5 |
| layoutLMv3+WE-F3+AF2+SoftMax | 97.0 | 91.0 | 94.7 | 97.6 | 97.1 | 95.5 |

YOLOV9

| Model | Text | Title | Figure | Figure caption | Table | Table caption | Header | Footer | Reference | Equation | mAP |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| yolov9 | 95.8 | 95.0 | 89.5 | 92.3 | 98.7 | 95.9 | 90.5 | 79.8 | 95.7 | 92.7 | 92.6 |
| yolov9 + text (line-level token embedding) | 94.3 | 90.2 | 86.7 | 88.1 | 98.7 | 91.8 | 86.9 | 73.7 | 93.4 | 87.7 | 89.1 |

Training with text: very unstable in the early phase:

(Figure: image-20240428091028034)

11. Fusion methods

# 1. Channel-dimension fusion
import numpy as np
import torch

def joinEmmbed2(embedding_feat, CNNfeat, bbox):
    # embedding_feat: (B, N, C) text-token features; CNNfeat: (B, C, H, W) visual map;
    # bbox: (B, N, 4) boxes normalized to [0, 1000]
    batch_size, n, _ = bbox.size()
    bbox = bbox.float() / 1000
    # print(bbox)
    CW, CH = CNNfeat.shape[2], CNNfeat.shape[3]
    text_tensor = torch.zeros_like(CNNfeat).to(embedding_feat.device)
    text_tensor = text_tensor.permute(0, 2, 3, 1)

    # scale the normalized boxes to feature-map coordinates
    bbox[:, :, 0] = bbox[:, :, 0] * CW
    bbox[:, :, 2] = bbox[:, :, 2] * CW
    bbox[:, :, 1] = bbox[:, :, 1] * CH
    bbox[:, :, 3] = bbox[:, :, 3] * CH
    bbox = bbox.long()
    CNNfeat = CNNfeat.permute(0, 2, 3, 1)
    for batch_idx in range(batch_size):
        for bbox_idx in range(n):
            # coordinates of the current bbox
            bx = bbox[batch_idx][bbox_idx].cpu().numpy()
            if not np.all(bx == 0):
                # paint the token's text feature into its bbox region
                text_tensor[batch_idx, bx[0]:bx[2] + 1, bx[1]:bx[3] + 1, :] += embedding_feat[batch_idx][bbox_idx]
            else:
                break
    text_tensor = text_tensor.permute(0, 3, 1, 2)
    CNNfeat = CNNfeat.permute(0, 3, 1, 2)
    # concatenate the text map and the visual map along the channel dimension
    CNNfeat = torch.cat((CNNfeat, text_tensor), dim=1)
    return CNNfeat

in_features=in_features * 2,  # was in_features=in_features; the head's input width doubles because the text map is concatenated along the channel dimension
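
A quick shape check of joinEmmbed2 with dummy tensors (the sizes below are arbitrary and only for illustration):

import torch

B, N, C, H, W = 2, 5, 768, 56, 40
embedding_feat = torch.randn(B, N, C)                 # one feature vector per text token
CNNfeat = torch.randn(B, C, H, W)                     # visual feature map
bbox = torch.randint(0, 1000, (B, N, 4))              # boxes in 0-1000 coordinates
bbox, _ = torch.sort(bbox.view(B, N, 2, 2), dim=2)    # ensure x0 <= x1 and y0 <= y1
bbox = bbox.reshape(B, N, 4)

fused = joinEmmbed2(embedding_feat, CNNfeat, bbox)
print(fused.shape)  # torch.Size([2, 1536, 56, 40]) -> channels doubled, hence in_features * 2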









# 2. Backbone multimodal fusion
import torch
import torch.nn as nn

# A small conv network that predicts a per-pixel fusion score in (0, 1)
class MyNetwork(nn.Module):
    def __init__(self):
        super(MyNetwork, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels=768, out_channels=1, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            # more conv layers (or other layers) could be added here
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.sigmoid(x)
        return x

# create an instance
model = MyNetwork()

# create a tensor of size (B, C, W, H)
input_tensor = torch.randn((4, 768, 64, 64))

# forward pass
output_tensor = model(input_tensor)

# print the output size
print(output_tensor.size())

def joinEmmbed(embedding_feat, CNNfeat, bbox):
    # Returns the summed map, the original visual map and the text-only map, so the
    # gating network can re-weight visual vs. text features afterwards.
    orgCNNFT = CNNfeat.clone()
    batch_size, n, _ = bbox.size()
    bbox = bbox.float() / 1000
    # print(bbox)
    CW, CH = CNNfeat.shape[2], CNNfeat.shape[3]

    bbox[:, :, 0] = bbox[:, :, 0] * CW
    bbox[:, :, 2] = bbox[:, :, 2] * CW
    bbox[:, :, 1] = bbox[:, :, 1] * CH
    bbox[:, :, 3] = bbox[:, :, 3] * CH
    bbox = bbox.long()
    text_tensor = torch.zeros_like(CNNfeat).to(embedding_feat.device)
    text_tensor = text_tensor.permute(0, 2, 3, 1)

    CNNfeat = CNNfeat.permute(0, 2, 3, 1)
    for batch_idx in range(batch_size):
        for bbox_idx in range(n):
            # coordinates of the current bbox
            bx = bbox[batch_idx][bbox_idx].cpu().numpy()
            if not np.all(bx == 0):
                CNNfeat[batch_idx, bx[0]:bx[2] + 1, bx[1]:bx[3] + 1, :] += embedding_feat[batch_idx][bbox_idx]
                text_tensor[batch_idx, bx[0]:bx[2] + 1, bx[1]:bx[3] + 1, :] += embedding_feat[batch_idx][bbox_idx]
            else:
                break
    CNNfeat = CNNfeat.permute(0, 3, 1, 2)
    text_tensor = text_tensor.permute(0, 3, 1, 2)
    return CNNfeat, orgCNNFT, text_tensor

# inside the backbone: fuse text tokens into the j-th output feature map, then gate visual vs. text
CNNfeat, orgCNNFT, text_tensor = joinEmmbed(hidden_states[:, :boxlen, :], feat_out[self.out_features[j]], bbox)
CNNfeat = self.ScoreNetwork(CNNfeat)
feat_out[self.out_features[j]] = CNNfeat * orgCNNFT + (1 - CNNfeat) * text_tensor
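
In other words, the last three lines implement a learned pixel-wise gate: with g = ScoreNetwork(visual + text), a value in (0, 1) from the sigmoid conv network above, the backbone output becomes g * visual + (1 - g) * text. At a location where g = 0.8, for example, the fused feature is 80% visual feature and 20% text feature.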

layoutLMv3 + single/double-column distinction + text features fused at the classification-head pooling layer: 91.9 / 92.0 / 83.7 / 87.0 / 96.5 / 93.3 / 94.3 / 83.9 / 93.7 / 79.1, mAP 89.5 (per-class AP in the same column order as the CDLA tables above).

12. Parameters

 
# NOTE: given the new config system
# (https://detectron2.readthedocs.io/en/latest/tutorials/lazyconfigs.html),
# we will stop adding new functionalities to default CfgNode.

# -----------------------------------------------------------------------------
# Convention about Training / Test specific parameters
# -----------------------------------------------------------------------------
# Whenever an argument can be either used for training or for testing, the
# corresponding name will be post-fixed by a _TRAIN for a training parameter,
# or _TEST for a test-specific parameter.
# For example, the number of images during training will be
# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
# IMAGES_PER_BATCH_TEST

# -----------------------------------------------------------------------------
# Config definition
# -----------------------------------------------------------------------------

_C = CN()

# detectron2 config version
_C.VERSION = 2

_C.MODEL = CN()
_C.MODEL.LOAD_PROPOSALS = False  # Fast R-CNN-style parameter (load precomputed proposals)
_C.MODEL.MASK_ON = False  # whether this is a segmentation task
_C.MODEL.KEYPOINT_ON = False  # whether this is a keypoint-detection task
_C.MODEL.DEVICE = "cuda"
_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"  # meta-architecture, i.e. the overall model structure

_C.MODEL.WEIGHTS = ""  # model weights; can be a local file or a URL

# Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR).
# To train on images of different number of channels, just set different mean & std.
# Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
_C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675]  # per-channel mean of the input image
# When using pre-trained models in Detectron1 or any MSRA models,
# std has been absorbed into its conv1 weights, so the std needs to be set to 1.
# Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
_C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0]  # per-channel std of the input image
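# In other words, each input channel c is normalized as (pixel - PIXEL_MEAN[c]) / PIXEL_STD[c];
# with PIXEL_STD = [1.0, 1.0, 1.0] only the mean is subtracted.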


# -----------------------------------------------------------------------------
# INPUT
# -----------------------------------------------------------------------------
_C.INPUT = CN()
# By default, {MIN,MAX}_SIZE options are used in transforms.ResizeShortestEdge.
# Please refer to ResizeShortestEdge for detailed definition.
# Size of the smallest side of the image during training
_C.INPUT.MIN_SIZE_TRAIN = (800,)  # minimum size of the shorter image side during training
# Sample size of smallest side by choice or random selection from range given by
# INPUT.MIN_SIZE_TRAIN
_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
# Maximum size of the side of the image during training
_C.INPUT.MAX_SIZE_TRAIN = 1333
# Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
_C.INPUT.MIN_SIZE_TEST = 800
# Maximum size of the side of the image during testing
_C.INPUT.MAX_SIZE_TEST = 1333
# Mode for flipping images used in data augmentation during training
# choose one of ["horizontal", "vertical", "none"]
_C.INPUT.RANDOM_FLIP = "horizontal"

# `True` if cropping is used for data augmentation during training
_C.INPUT.CROP = CN({"ENABLED": False})  # whether to use random cropping for augmentation
# Cropping type. See documentation of `detectron2.data.transforms.RandomCrop` for explanation.
_C.INPUT.CROP.TYPE = "relative_range"  # crop type
# Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of
# pixels if CROP.TYPE is "absolute"
_C.INPUT.CROP.SIZE = [0.9, 0.9]


# Whether the model needs RGB, YUV, HSV etc.
# Should be one of the modes defined here, as we use PIL to read the image:
# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
# with BGR being the one exception. One can set image format to BGR, we will
# internally use RGB for conversion and flip the channels over
_C.INPUT.FORMAT = "BGR"
# The ground truth mask format that the model will use.
# Mask R-CNN supports either "polygon" or "bitmask" as ground truth.
_C.INPUT.MASK_FORMAT = "polygon" # alternative: "bitmask"


# -----------------------------------------------------------------------------
# Dataset
# -----------------------------------------------------------------------------
_C.DATASETS = CN()
# List of the dataset names for training. Must be registered in DatasetCatalog
# Samples from these datasets will be merged and used as one dataset.
_C.DATASETS.TRAIN = ()  # list of registered training-dataset names
# List of the pre-computed proposal files for training, which must be consistent
# with datasets listed in DATASETS.TRAIN.
_C.DATASETS.PROPOSAL_FILES_TRAIN = ()
# Number of top scoring precomputed proposals to keep for training
_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000
# List of the dataset names for testing. Must be registered in DatasetCatalog
_C.DATASETS.TEST = ()
# List of the pre-computed proposal files for test, which must be consistent
# with datasets listed in DATASETS.TEST.
_C.DATASETS.PROPOSAL_FILES_TEST = ()
# Number of top scoring precomputed proposals to keep for test
_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000

# -----------------------------------------------------------------------------
# DataLoader
# -----------------------------------------------------------------------------
_C.DATALOADER = CN()
# Number of data loading threads
_C.DATALOADER.NUM_WORKERS = 0
# If True, each batch should contain only images for which the aspect ratio
# is compatible. This groups portrait images together, and landscape images
# are not batched with portrait images.
_C.DATALOADER.ASPECT_RATIO_GROUPING = True
# Options: TrainingSampler, RepeatFactorTrainingSampler
_C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler"  # training-sample sampling strategy
# Repeat threshold for RepeatFactorTrainingSampler
_C.DATALOADER.REPEAT_THRESHOLD = 0.0  # sample-repeat ratio threshold
# If True, when working on datasets that have instance annotations, the
# training dataloader will filter out images without associated annotations
_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True  # filter out images with empty annotations

# ---------------------------------------------------------------------------- #
# Backbone options
# ---------------------------------------------------------------------------- #
_C.MODEL.BACKBONE = CN()

_C.MODEL.BACKBONE.NAME = "build_resnet_backbone"  # backbone network builder
# Freeze the first several stages so they are not trained.
# There are 5 stages in ResNet. The first is a convolution, and the following
# stages are each group of residual blocks.
_C.MODEL.BACKBONE.FREEZE_AT = 2  # freeze backbone weights up to (and including) this stage


# ---------------------------------------------------------------------------- #
# FPN options
# ---------------------------------------------------------------------------- #
_C.MODEL.FPN = CN()
# Names of the input feature maps to be used by FPN
# They must have contiguous power of 2 strides
# e.g., ["res2", "res3", "res4", "res5"]
_C.MODEL.FPN.IN_FEATURES = []  # which backbone outputs feed the FPN
_C.MODEL.FPN.OUT_CHANNELS = 256  # number of FPN output channels

# Options: "" (no norm), "GN"
_C.MODEL.FPN.NORM = ""

# Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg"
_C.MODEL.FPN.FUSE_TYPE = "sum"  # how top-down and lateral features are fused


# ---------------------------------------------------------------------------- #
# Proposal generator options
# ---------------------------------------------------------------------------- #
_C.MODEL.PROPOSAL_GENERATOR = CN()
# Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals"
_C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
# Proposal height and width both need to be greater than MIN_SIZE
# (at the scale used during training or inference)
_C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0


# ---------------------------------------------------------------------------- #
# Anchor generator options
# ---------------------------------------------------------------------------- #
_C.MODEL.ANCHOR_GENERATOR = CN()
# The generator can be any name in the ANCHOR_GENERATOR registry
_C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
# Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input.
# Format: list[list[float]]. SIZES[i] specifies the list of sizes to use for
# IN_FEATURES[i]; len(SIZES) must be equal to len(IN_FEATURES) or 1.
# When len(SIZES) == 1, SIZES[0] is used for all IN_FEATURES.
_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
# Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect
# ratios are generated by an anchor generator.
# Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W)
# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true,
# or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used
# for all IN_FEATURES.
_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]]
# Anchor angles.
# list[list[float]], the angle in degrees, for each input feature map.
# ANGLES[i] specifies the list of angles for IN_FEATURES[i].
_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]]
# Relative offset between the center of the first anchor and the top-left corner of the image
# Value has to be in [0, 1). Recommend to use 0.5, which means half stride.
# The value is not expected to affect model accuracy.
_C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0

# ---------------------------------------------------------------------------- #
# RPN options
# ---------------------------------------------------------------------------- #
_C.MODEL.RPN = CN()
_C.MODEL.RPN.HEAD_NAME = "StandardRPNHead" # used by RPN_HEAD_REGISTRY

# Names of the input feature maps to be used by RPN
# e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN
_C.MODEL.RPN.IN_FEATURES = ["res4"]
# Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels
# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
_C.MODEL.RPN.BOUNDARY_THRESH = -1
# IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD]
# Minimum overlap required between an anchor and ground-truth box for the
# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
# ==> positive RPN example: 1)
# Maximum overlap allowed between an anchor and ground-truth box for the
# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
# ==> negative RPN example: 0)
# Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD)
# are ignored (-1)
_C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7]
_C.MODEL.RPN.IOU_LABELS = [0, -1, 1]
# Number of regions per image used to train RPN
_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
# Target fraction of foreground (positive) examples per RPN minibatch
_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
# Options are: "smooth_l1", "giou", "diou", "ciou"
_C.MODEL.RPN.BBOX_REG_LOSS_TYPE = "smooth_l1"
_C.MODEL.RPN.BBOX_REG_LOSS_WEIGHT = 1.0
# Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets
_C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
_C.MODEL.RPN.SMOOTH_L1_BETA = 0.0
_C.MODEL.RPN.LOSS_WEIGHT = 1.0
# Number of top scoring RPN proposals to keep before applying NMS
# When FPN is used, this is *per FPN level* (not total)
_C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000
_C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000
# Number of top scoring RPN proposals to keep after applying NMS
# When FPN is used, this limit is applied per level and then again to the union
# of proposals from all levels
# NOTE: When FPN is used, the meaning of this config is different from Detectron1.
# It means per-batch topk in Detectron1, but per-image topk here.
# See the "find_top_rpn_proposals" function for details.
_C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000
_C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000
# NMS threshold used on RPN proposals
_C.MODEL.RPN.NMS_THRESH = 0.7
# Set this to -1 to use the same number of output channels as input channels.
_C.MODEL.RPN.CONV_DIMS = [-1]

# ---------------------------------------------------------------------------- #
# ROI HEADS options
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_HEADS = CN()
_C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads"
# Number of foreground classes
_C.MODEL.ROI_HEADS.NUM_CLASSES = 80
# Names of the input feature maps to be used by ROI heads
# Currently all heads (box, mask, ...) use the same input feature map list
# e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN
_C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"]
# IOU overlap ratios [IOU_THRESHOLD]
# Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD)
# Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD)
_C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5]
_C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1]
# RoI minibatch size *per image* (number of regions of interest [ROIs]) during training
# Total number of RoIs per training minibatch =
# ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
# E.g., a common configuration is: 512 * 16 = 8192
_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25

# Only used on test mode

# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
# balance obtaining high recall with not having too many low precision
# detections that will slow down inference post processing steps (like NMS)
# A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down
# inference.
_C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05
# Overlap threshold used for non-maximum suppression (suppress boxes with
# IoU >= this threshold)
_C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5
# If True, augment proposals with ground-truth boxes before sampling proposals to
# train ROI heads.
_C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True

# ---------------------------------------------------------------------------- #
# Box Head
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_BOX_HEAD = CN()
# C4 don't use head name option
# Options for non-C4 models: FastRCNNConvFCHead,
_C.MODEL.ROI_BOX_HEAD.NAME = ""
# Options are: "smooth_l1", "giou", "diou", "ciou"
_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE = "smooth_l1"
# The final scaling coefficient on the box regression loss, used to balance the magnitude of its
# gradients with other losses in the model. See also `MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT`.
_C.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT = 1.0
# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
# These are empirically chosen to approximately lead to unit variance targets
_C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
_C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0
_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
# Type of pooling operation applied to the incoming feature map for each RoI
_C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"

_C.MODEL.ROI_BOX_HEAD.NUM_FC = 0
# Hidden layer dimension for FC layers in the RoI box head
_C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024
_C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0
# Channel dimension for Conv layers in the RoI box head
_C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256
# Normalization method for the convolution layers.
# Options: "" (no norm), "GN", "SyncBN".
_C.MODEL.ROI_BOX_HEAD.NORM = ""
# Whether to use class agnostic for bbox regression
_C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False
# If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes.
_C.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False

# ---------------------------------------------------------------------------- #
# Cascaded Box Head
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_BOX_CASCADE_HEAD = CN()
# The number of cascade stages is implicitly defined by the length of the following two configs.
_C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = (
(10.0, 10.0, 5.0, 5.0),
(20.0, 20.0, 10.0, 10.0),
(30.0, 30.0, 15.0, 15.0),
)
_C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7)


# ---------------------------------------------------------------------------- #
# Mask Head
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_MASK_HEAD = CN()
_C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead"
_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
_C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0 # The number of convs in the mask head
_C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256
# Normalization method for the convolution layers.
# Options: "" (no norm), "GN", "SyncBN".
_C.MODEL.ROI_MASK_HEAD.NORM = ""
# Whether to use class agnostic for mask prediction
_C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False
# Type of pooling operation applied to the incoming feature map for each RoI
_C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2"


# ---------------------------------------------------------------------------- #
# Keypoint Head
# ---------------------------------------------------------------------------- #
_C.MODEL.ROI_KEYPOINT_HEAD = CN()
_C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead"
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
_C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8))
_C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17 # 17 is the number of keypoints in COCO.

# Images with too few (or no) keypoints are excluded from training.
_C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1
# Normalize by the total number of visible keypoints in the minibatch if True.
# Otherwise, normalize by the total number of keypoints that could ever exist
# in the minibatch.
# The keypoint softmax loss is only calculated on visible keypoints.
# Since the number of visible keypoints can vary significantly between
# minibatches, this has the effect of up-weighting the importance of
# minibatches with few visible keypoints. (Imagine the extreme case of
# only one visible keypoint versus N: in the case of N, each one
# contributes 1/N to the gradient compared to the single keypoint
# determining the gradient direction). Instead, we can normalize the
# loss by the total number of keypoints, if it were the case that all
# keypoints were visible in a full minibatch. (Returning to the example,
# this means that the one visible keypoint contributes as much as each
# of the N keypoints.)
_C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True
# Multi-task loss weight to use for keypoints
# Recommended values:
# - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True
# - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False
_C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0
# Type of pooling operation applied to the incoming feature map for each RoI
_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2"

# ---------------------------------------------------------------------------- #
# Semantic Segmentation Head
# ---------------------------------------------------------------------------- #
_C.MODEL.SEM_SEG_HEAD = CN()
_C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead"
_C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"]
# Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for
# the corresponding pixel.
_C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255
# Number of classes in the semantic segmentation head
_C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54
# Number of channels in the 3x3 convs inside semantic-FPN heads.
_C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128
# Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride.
_C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4
# Normalization method for the convolution layers. Options: "" (no norm), "GN".
_C.MODEL.SEM_SEG_HEAD.NORM = "GN"
_C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0

_C.MODEL.PANOPTIC_FPN = CN()
# Scaling of all losses from instance detection / segmentation head.
_C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0

# options when combining instance & semantic segmentation outputs
_C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True}) # "COMBINE.ENABLED" is deprecated & not used
_C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5
_C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096
_C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5


# ---------------------------------------------------------------------------- #
# RetinaNet Head
# ---------------------------------------------------------------------------- #
_C.MODEL.RETINANET = CN()

# This is the number of foreground classes.
_C.MODEL.RETINANET.NUM_CLASSES = 80

_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]

# Convolutions to use in the cls and bbox tower
# NOTE: this doesn't include the last conv for logits
_C.MODEL.RETINANET.NUM_CONVS = 4

# IoU overlap ratio [bg, fg] for labeling anchors.
# Anchors with < bg are labeled negative (0)
# Anchors with >= bg and < fg are ignored (-1)
# Anchors with >= fg are labeled positive (1)
_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5]
_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1]

# Prior prob for rare case (i.e. foreground) at the beginning of training.
# This is used to set the bias for the logits layer of the classifier subnet.
# This improves training stability in the case of heavy class imbalance.
_C.MODEL.RETINANET.PRIOR_PROB = 0.01

# Inference cls score threshold, only anchors with score > INFERENCE_TH are
# considered for inference (to improve speed)
_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
# Select topk candidates before NMS
_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5

# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets
_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)

# Loss parameters
_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1
# Options are: "smooth_l1", "giou", "diou", "ciou"
_C.MODEL.RETINANET.BBOX_REG_LOSS_TYPE = "smooth_l1"

# One of BN, SyncBN, FrozenBN, GN
# Only supports GN until unshared norm is implemented
_C.MODEL.RETINANET.NORM = ""


# ---------------------------------------------------------------------------- #
# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
# Note that parts of a resnet may be used for both the backbone and the head
# These options apply to both
# ---------------------------------------------------------------------------- #
_C.MODEL.RESNETS = CN()

_C.MODEL.RESNETS.DEPTH = 50
_C.MODEL.RESNETS.OUT_FEATURES = ["res4"] # res4 for C4 backbone, res2..5 for FPN backbone

# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
_C.MODEL.RESNETS.NUM_GROUPS = 1

# Options: FrozenBN, GN, "SyncBN", "BN"
_C.MODEL.RESNETS.NORM = "FrozenBN"

# Baseline width of each group.
# Scaling this parameters will scale the width of all bottleneck layers.
_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64

# Place the stride 2 conv on the 1x1 filter
# Use True only for the original MSRA ResNet; use False for C2 and Torch models
_C.MODEL.RESNETS.STRIDE_IN_1X1 = True

# Apply dilation in stage "res5"
_C.MODEL.RESNETS.RES5_DILATION = 1

# Output width of res2. Scaling this parameters will scale the width of all 1x1 convs in ResNet
# For R18 and R34, this needs to be set to 64
_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64

# Apply Deformable Convolution in stages
# Specify if apply deform_conv on Res2, Res3, Res4, Res5
_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False]
# Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168);
# Use False for DeformableV1.
_C.MODEL.RESNETS.DEFORM_MODULATED = False
# Number of groups in deformable conv.
_C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1


# ---------------------------------------------------------------------------- #
# Solver
# ---------------------------------------------------------------------------- #
_C.SOLVER = CN()

# Options: WarmupMultiStepLR, WarmupCosineLR.
# See detectron2/solver/build.py for definition.
_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"  # learning-rate scheduling strategy

_C.SOLVER.MAX_ITER = 40000

_C.SOLVER.BASE_LR = 0.001
# The end lr, only used by WarmupCosineLR
_C.SOLVER.BASE_LR_END = 0.0

_C.SOLVER.MOMENTUM = 0.9

_C.SOLVER.NESTEROV = False

_C.SOLVER.WEIGHT_DECAY = 0.0001
# The weight decay that's applied to parameters of normalization layers
# (typically the affine transformation)
_C.SOLVER.WEIGHT_DECAY_NORM = 0.0

_C.SOLVER.GAMMA = 0.1
# The iteration number to decrease learning rate by GAMMA.
_C.SOLVER.STEPS = (30000,)  # decay the learning rate at iteration 30000

_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000
_C.SOLVER.WARMUP_ITERS = 1000
_C.SOLVER.WARMUP_METHOD = "linear"

# Save a checkpoint after every this number of iterations
_C.SOLVER.CHECKPOINT_PERIOD = 5000

# Number of images per batch across all machines. This is also the number
# of training images per step (i.e. per iteration). If we use 16 GPUs
# and IMS_PER_BATCH = 32, each GPU will see 2 images per batch.
# May be adjusted automatically if REFERENCE_WORLD_SIZE is set.
_C.SOLVER.IMS_PER_BATCH = 16
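# For example, with the 4-GPU launch used in this note and IMS_PER_BATCH = 4,
# each GPU sees 4 / 4 = 1 image per iteration.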

# The reference number of workers (GPUs) this config is meant to train with.
# It takes no effect when set to 0.
# With a non-zero value, it will be used by DefaultTrainer to compute a desired
# per-worker batch size, and then scale the other related configs (total batch size,
# learning rate, etc) to match the per-worker batch size.
# See documentation of `DefaultTrainer.auto_scale_workers` for details:
_C.SOLVER.REFERENCE_WORLD_SIZE = 0

# Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for
# biases. This is not useful (at least for recent models). You should avoid
# changing these and they exist only to reproduce Detectron v1 training if
# desired.
_C.SOLVER.BIAS_LR_FACTOR = 1.0
_C.SOLVER.WEIGHT_DECAY_BIAS = None # None means following WEIGHT_DECAY

# Gradient clipping
_C.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False})
# Type of gradient clipping, currently 2 values are supported:
# - "value": the absolute values of elements of each gradients are clipped
# - "norm": the norm of the gradient for each parameter is clipped thus
# affecting all elements in the parameter
_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value"
# Maximum absolute value used for clipping gradients
_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
# Floating point number p for L-p norm to be used with the "norm"
# gradient clipping type; for L-inf, please specify .inf
_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
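# Roughly: "value" clipping corresponds to torch.nn.utils.clip_grad_value_(params, CLIP_VALUE),
# and "norm" clipping to torch.nn.utils.clip_grad_norm_(params, CLIP_VALUE, norm_type=NORM_TYPE).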

# Enable automatic mixed precision for training
# Note that this does not change model's inference behavior.
# To use AMP in inference, run inference under autocast()
_C.SOLVER.AMP = CN({"ENABLED": False})

# ---------------------------------------------------------------------------- #
# Specific test options
# ---------------------------------------------------------------------------- #
_C.TEST = CN()
# For end-to-end tests to verify the expected accuracy.
# Each item is [task, metric, value, tolerance]
# e.g.: [['bbox', 'AP', 38.5, 0.2]]
_C.TEST.EXPECTED_RESULTS = []
# The period (in terms of steps) to evaluate the model during training.
# Set to 0 to disable.
_C.TEST.EVAL_PERIOD = 0  # evaluate every this many iterations (0 disables periodic evaluation)
# The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval
# When empty, it will use the defaults in COCO.
# Otherwise it should be a list[float] with the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
_C.TEST.KEYPOINT_OKS_SIGMAS = []
# Maximum number of detections to return per image during inference (100 is
# based on the limit established for the COCO dataset).
_C.TEST.DETECTIONS_PER_IMAGE = 100  # maximum number of detections returned per image

_C.TEST.AUG = CN({"ENABLED": False})
_C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
_C.TEST.AUG.MAX_SIZE = 4000
_C.TEST.AUG.FLIP = True

_C.TEST.PRECISE_BN = CN({"ENABLED": False})
_C.TEST.PRECISE_BN.NUM_ITER = 200

# ---------------------------------------------------------------------------- #
# Misc options
# ---------------------------------------------------------------------------- #
# Directory where output files are written
_C.OUTPUT_DIR = "./output"
# Set seed to negative to fully randomize everything.
# Set seed to positive to use a fixed seed. Note that a fixed seed increases
# reproducibility but does not guarantee fully deterministic behavior.
# Disabling all parallelism further increases reproducibility.
_C.SEED = -1
# Benchmark different cudnn algorithms.
# If input images have very different sizes, this option will have large overhead
# for about 10k iterations. It usually hurts total time, but can benefit for certain models.
# If input images have the same or similar sizes, benchmark is often helpful.
_C.CUDNN_BENCHMARK = False
# The period (in terms of steps) for minibatch visualization at train time.
# Set to 0 to disable.
_C.VIS_PERIOD = 0

# global config is for quick hack purposes.
# You can set them in command line or config files,
# and access it with:
#
# from detectron2.config import global_cfg
# print(global_cfg.HACK)
#
# Do not commit any configs into it.
_C.GLOBAL = CN()
_C.GLOBAL.HACK = 1.0
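
To see how these defaults combine with the YAML in the next section, a minimal loading sketch (it assumes the repo's ditod.add_vit_config helper is what registers the non-default keys such as MODEL.VIT and AUG; the paths are the ones used earlier in this note):

from detectron2.config import get_cfg
from ditod import add_vit_config  # assumption: registers MODEL.VIT, AUG, etc. on top of the defaults above

cfg = get_cfg()  # starts from the _C defaults listed in this section
add_vit_config(cfg)
cfg.merge_from_file("/data1/lxj/workspace/layout/unilm/layoutlmv3/examples/object_detection/cascade_layoutlmv3.yaml")
cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", "4", "OUTPUT_DIR", "/data1/lxj/workspace/layout/output/12-14"])
cfg.freeze()
print(cfg.MODEL.ROI_HEADS.NUM_CLASSES, cfg.SOLVER.BASE_LR)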

13. Adjusted parameters

MODEL:
  MASK_ON: True
  IMAGE_ONLY: True
  META_ARCHITECTURE: "VLGeneralizedRCNN"
  PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
  PIXEL_STD: [ 127.5, 127.5, 127.5 ]
  WEIGHTS: "/data1/lxj/workspace/layout/model/layoutlmv3-base-chinese/model_final.pth"
  BACKBONE:
    NAME: "build_vit_fpn_backbone"
  VIT:
    NAME: "layoutlmv3_base"
    OUT_FEATURES: [ "layer3", "layer5", "layer7", "layer11" ]
    DROP_PATH: 0.1
    IMG_SIZE: [ 448, 448 ]
    POS_TYPE: "abs"
  ROI_HEADS:
    NAME: CascadeROIHeads
    IN_FEATURES: [ "p2", "p3", "p4", "p5" ]
    NUM_CLASSES: 11
  ROI_BOX_HEAD:
    CLS_AGNOSTIC_BBOX_REG: True
    NAME: "FastRCNNConvFCHead"
    BBOX_REG_LOSS_TYPE: "ciou"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
  FPN:
    IN_FEATURES: [ "layer3", "layer5", "layer7", "layer11" ]
  ANCHOR_GENERATOR:
    SIZES: [ [ 32 ], [ 64 ], [ 128 ], [ 256 ], [ 512 ] ]  # One size for each in feature map
    ASPECT_RATIOS: [ [ 0.5, 1.0, 2.0 ] ]  # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: [ "p2", "p3", "p4", "p5", "p6" ]
    BBOX_REG_LOSS_TYPE: "ciou"
    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 2000
    POST_NMS_TOPK_TEST: 1000
DATASETS:
  TRAIN: ("publaynet_train",)
  TEST: ("publaynet_val",)
SOLVER:
  GRADIENT_ACCUMULATION_STEPS: 1
  BASE_LR: 0.0002
  WARMUP_ITERS: 1000
  IMS_PER_BATCH: 1
  MAX_ITER: 30000
  CHECKPOINT_PERIOD: 2000
  LR_SCHEDULER_NAME: "WarmupCosineLR"
  AMP:
    ENABLED: True
  OPTIMIZER: "ADAMW"
  BACKBONE_MULTIPLIER: 1.0
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
  WARMUP_FACTOR: 0.01
  WEIGHT_DECAY: 0.05
TEST:
  EVAL_PERIOD: 2000
INPUT:
  CROP:
    ENABLED: True
    TYPE: "absolute_range"
    SIZE: (384, 600)
  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
  FORMAT: "RGB"
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: False
VERSION: 2
AUG:
  DETR: True
SEED: 42
OUTPUT_DIR: "/data1/lxj/workspace/layout/output/test3-7"
PUBLAYNET_DATA_DIR_TRAIN: "/path/to/data/PubLayNet/publaynet/train"
PUBLAYNET_DATA_DIR_TEST: "/path/to/data/PubLayNet/publaynet/val"
CACHE_DIR: "/path/to/cache/huggingface"
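
For reference, the crop/resize settings above correspond to detectron2 transforms roughly like the following (a sketch of the augmentation, not the repo's exact dataloader code; max_size=1333 comes from the INPUT defaults in section 12):

from detectron2.data import transforms as T

# "absolute_range" with SIZE (384, 600): each crop side is sampled from [384, 600] pixels
# (capped at the image size), then the shorter edge is resized to one of MIN_SIZE_TRAIN.
crop = T.RandomCrop("absolute_range", (384, 600))
resize = T.ResizeShortestEdge(
    short_edge_length=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800),
    max_size=1333,
    sample_style="choice",
)
augmentations = [crop, resize]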