This article covers the dataset and training strategy for the intelligent guide-dog robot competition. On the dataset side, it walks through merging the train and val splits (with code) and introduces pseudo-label training (semi-supervised learning) in entry-level, advanced, and creative variants. On the training-strategy side, it offers PaddleDetection-oriented notes on GPU count, batch size, and optimizer choice, and provides a ConvNeXt code example with DropPath.

A guide dog brings a blind person enormous convenience in daily life, but guide dogs take a long time and a lot of money to train, so not every blind person can have one. If a robot dog could stand in for a guide dog, it would be a great benefit to the blind. This competition, the intelligent guide-dog robot challenge, evaluates a robot dog's perception ability and overall locomotion performance: the quadruped bionic robot must walk a staged urban-sidewalk course from start to finish while completing assigned tasks.
It sounds grand, but once we look at the concrete task we find it is actually a fairly simple object-detection problem. In the preliminary round the organizers provide five different kinds of targets for you to detect, but to serve the real-world scenario they also impose requirements on model size and inference speed: the model must stay under 200 MB and detection must run at no less than 20 FPS.
The competition is now nearing its end. By this point most teams have settled on an approach and largely finished improving their networks, so what remains is to get creative with the dataset and the training strategy. Let's put some work into both.
Previous installment in this series: 导盲赛道思路分享 (sharing ideas for the guide-dog track).
Some competitions simply hand you a pile of data and let you split train and val yourself. The usual routine is: split, train, and iterate on the network bit by bit until val performance peaks, which fixes the architecture; then merge val back into train, retrain the best network once more on everything, and submit. In this competition the organizers have already split train and val for you and fixed them in COCO format. That saves you the first step and guarantees that train and val share the same distribution, and having the COCO format is a big help while you are settling on the architecture. But if data processing is a weak spot for you, merging train and val back together is where things can go wrong. Don't worry, I've already done that work for you here.
Just run the code below to get a single folder containing all of the train and val data:
```
## Unpack the dataset archive
!tar -zxvf data/data137625/WisdomGuide.tar.gz
## Install the required third-party libraries
!pip install lxml
!pip install pycocotools
```

```python
# coco2voc: convert the COCO-format train and val splits into one VOC-format set
from pycocotools.coco import COCO
import os, shutil
from lxml import etree, objectify
from tqdm import tqdm
from PIL import Image
import numpy as np
import time
import json


def cover_copy(src, dst):
    '''src and dst must both be files; copies src over dst, overwriting dst if it exists.'''
    if os.path.exists(dst):
        os.remove(dst)
        shutil.copy(src, dst)
    else:
        shutil.copy(src, dst)


def coco2voc(basedir='VOCdevkit/COCO_VOC', sourcedir='WisdomGuide'):
    """
    basedir: where the converted images and annotation files are written
    sourcedir: location of the original COCO dataset
    """
    img_savepath = os.path.join(basedir, 'JPEGImages')
    ann_savepath = os.path.join(basedir, 'Annotations')
    main_path = os.path.join(basedir, "annotations")
    for p in [basedir, img_savepath, ann_savepath, main_path]:
        if os.path.exists(p):
            shutil.rmtree(p)
            os.makedirs(p)
        else:
            os.makedirs(p)

    datasets = ['train', 'val']
    for dataset in datasets:
        start = time.time()
        print(f"start {dataset}")
        no_ann = []   # ids of images without annotations; copied out for inspection
        not_rgb = []  # grayscale images; likewise saved for inspection
        annfile = 'instance_{}.json'.format(dataset)
        annpath = os.path.join(sourcedir, 'annotations', annfile)
        print('loading annotations into memory...')
        tic = time.time()
        with open(annpath, 'r') as f:
            dataset_ann = json.load(f)
        assert type(dataset_ann) == dict, \
            'annotation file format {} not supported'.format(type(dataset_ann))
        print('Done (t={:0.2f}s)'.format(time.time() - tic))

        coco = COCO(annpath)
        classes = dict()
        for cat in coco.dataset['categories']:
            classes[cat['id']] = cat['name']
        imgIds = coco.getImgIds()
        # imgIds = imgIds[0:1000]  # for testing: take a slice to preview the output
        for imgId in tqdm(imgIds):
            img = coco.loadImgs(imgId)[0]
            filename = img['file_name']
            filepath = os.path.join(sourcedir, dataset, filename)
            annIds = coco.getAnnIds(imgIds=img['id'], iscrowd=None)
            anns = coco.loadAnns(annIds)
            if not len(anns):
                # no annotations: copy the image to {dataset}_noann_result for
                # inspection, then move on to the next image
                no_ann.append(imgId)
                result_path = os.path.join(sourcedir, dataset + "_noann_result")
                dest_path = os.path.join(result_path, filename)
                if not os.path.exists(result_path):
                    os.makedirs(result_path)
                cover_copy(filepath, dest_path)
                continue
            # the image has annotations: collect them
            objs = []
            for ann in anns:
                name = classes[ann['category_id']]
                if 'bbox' in ann:
                    bbox = ann['bbox']
                    xmin = (int)(bbox[0])
                    ymin = (int)(bbox[1])
                    xmax = (int)(bbox[2] + bbox[0])
                    ymax = (int)(bbox[3] + bbox[1])
                    obj = [name, 1.0, xmin, ymin, xmax, ymax]
                    # zero-area (mislabeled) boxes are filtered out here
                    if not (xmin - xmax == 0 or ymin - ymax == 0):
                        objs.append(obj)
                else:
                    # a single image can carry several annotations; some may lack a bbox
                    print(f"{dataset}:{imgId} annotation has no bbox field")
            annopath = os.path.join(ann_savepath, filename[:-3] + "xml")  # output XML path
            dst_path = os.path.join(img_savepath, filename)
            im = Image.open(filepath)
            image = np.array(im).astype(np.uint8)
            if im.mode != "RGB":
                # non-RGB image: copy it to {dataset}_notrgb_result for inspection,
                # then convert to RGB and save to the target folder
                not_rgb.append(imgId)
                result_path = os.path.join(sourcedir, dataset + "_notrgb_result")
                dest_path = os.path.join(result_path, filename)
                if not os.path.exists(result_path):
                    os.makedirs(result_path)
                cover_copy(filepath, dest_path)
                im = im.convert('RGB')
                image = np.array(im).astype(np.uint8)
                im.save(dst_path, quality=95)
                im.close()
            else:
                cover_copy(filepath, dst_path)  # copy the original image to the target folder
            E = objectify.ElementMaker(annotate=False)
            anno_tree = E.annotation(
                E.folder('VOC'),
                E.filename(filename),
                E.source(
                    E.database('COCO'),
                    E.annotation('VOC'),
                    E.image('COCO')
                ),
                E.size(
                    E.width(image.shape[1]),
                    E.height(image.shape[0]),
                    E.depth(image.shape[2])
                ),
                E.segmented(0)
            )
            for obj in objs:
                E2 = objectify.ElementMaker(annotate=False)
                anno_tree2 = E2.object(
                    E.name(obj[0]),
                    E.pose(),
                    E.truncated("0"),
                    E.difficult(0),
                    E.bndbox(
                        E.xmin(obj[2]),
                        E.ymin(obj[3]),
                        E.xmax(obj[4]),
                        E.ymax(obj[5])
                    )
                )
                anno_tree.append(anno_tree2)
            etree.ElementTree(anno_tree).write(annopath, pretty_print=True)
        print(f"{dataset}: {len(no_ann)}/{len(imgIds)} images have no instance "
              f"annotations; they were copied to {dataset}_noann_result for inspection")
        print(f"{dataset}: {len(not_rgb)}/{len(imgIds)} images are non-RGB; "
              f"they were copied to {dataset}_notrgb_result for inspection")
        duration = time.time() - start
        print(f"dataset {dataset} processed in {round(duration / 60, 2)} minutes")


# run the conversion
coco2voc()
```

```
## voc2coco: convert the merged VOC annotations back into a single COCO json
!cd voc2coco && python voc2coco.py ../VOCdevkit/COCO_VOC/Annotations ../VOCdevkit/COCO_VOC/COCO.json
!mkdir COCO_all
!mv VOCdevkit/COCO_VOC/JPEGImages COCO_all/
!mv VOCdevkit/COCO_VOC/COCO.json COCO_all/
```
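With COCO_all in place, you can point PaddleDetection straight at the merged data for the final training run. A minimal sketch, assuming PaddleDetection 2.x's COCODataSet reader (num_classes reflects the five competition targets; adjust dataset_dir to wherever you keep COCO_all):

```yaml
metric: COCO
num_classes: 5

TrainDataset:
  !COCODataSet
    image_dir: JPEGImages
    anno_path: COCO.json
    dataset_dir: COCO_all
    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
```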
Some competitions, unlike this guide-dog contest, don't have you submit the network itself; instead they release the test images and ask you to submit your predictions. That's where pseudo-labels come in. Put simply, you treat the model's predictions as annotation data and train again on them together with the data the organizers provided. This typically buys a few tenths of a percentage point, although some competitions forbid pseudo-labels. I took a look at our rules and found no such restriction, but unfortunately this competition doesn't release test images at all.

Still, I suspect some deep-pocketed, well-resourced schools have accumulated their own footage, much of it unannotated. Labeling it all by hand is unrealistic, so you can try training with pseudo-labels instead; this style of training also goes by a grander name: semi-supervised learning.
Here is a quick overview of semi-supervised approaches, borrowed from an article on Zhihu (a code sketch follows the three recipes below).
Source: 伪标签(Pseudo-Labelling)——锋利的匕首 (Pseudo-Labelling: a sharp dagger).
Entry-level version:
1. Train a supervised model M on the labeled data.
2. Use M to predict the unlabeled data, obtaining prediction probabilities P.
3. Use P to filter out high-confidence samples.
4. Train a new model M' on the labeled data plus the pseudo-labeled data.
Advanced version:
1. Train a supervised model M on the labeled data.
2. Use M to predict the unlabeled data, obtaining prediction probabilities P.
3. Use P to filter out high-confidence samples.
4. Train a new model M' on the labeled data plus the pseudo-labeled data.
5. Replace M with M' and repeat the steps above until the model stops improving.
Creative version:
1. Train a supervised model M on the labeled data.
2. Use M to predict the unlabeled data, obtaining prediction probabilities P.
3. Change the loss function to Loss = loss(labeled_data) + alpha * loss(unlabeled_data).
4. Train a new model M' on the labeled data plus the pseudo-labeled data.
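To make the entry-level recipe concrete, here is a minimal sketch. `predict` and `train` are hypothetical callables standing in for your own inference and retraining routines, not a real PaddleDetection API:

```python
from typing import Callable, List, Tuple

# (bbox, score, label) stands in for whatever your detector emits
Detection = Tuple[list, float, str]


def pseudo_label_round(predict: Callable[[str], List[Detection]],
                       train: Callable[[list], object],
                       labeled_set: list,
                       unlabeled_images: List[str],
                       conf_thresh: float = 0.9) -> object:
    """One round of the entry-level recipe: predict, filter, merge, retrain."""
    pseudo_set = []
    for img in unlabeled_images:
        # steps 2-3: predict, then keep only high-confidence boxes
        kept = [d for d in predict(img) if d[1] >= conf_thresh]
        if kept:
            pseudo_set.append((img, kept))
    # step 4: retrain on the labeled data plus the pseudo-labeled data
    return train(labeled_set + pseudo_set)
```

The advanced version simply wraps this in a loop, feeding each new model back in until val mAP stops improving; the creative version keeps the two datasets separate and optimizes loss(labeled_data) + alpha * loss(unlabeled_data) rather than merging them.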
Most of you are probably training with PaddleDetection, and on the training-strategy side there are a few things worth paying attention to, such as GPU count, batch size, and optimizer choice.
Here is the training strategy I use, offered as a reference:
```yaml
epoch: 400

LearningRate:
  base_lr: 0.00025
  schedulers:
  - !CosineDecay
    max_epochs: 500
  - !LinearWarmup
    start_factor: 0.
    epochs: 5

OptimizerBuilder:
  clip_grad_by_norm: 0.1
  regularizer: false
  optimizer:
    type: AdamW
    weight_decay: 0.0001
```
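One caveat on GPU count and batch size: PaddleDetection's reference configs are generally tuned for a fixed total batch size (cards times per-card batch size), and the common heuristic is to scale base_lr linearly when your total differs. A quick sanity check under that assumption (the 8-card reference below is illustrative; check the config you forked from):

```python
# Linear-scaling sanity check for base_lr. The 8-card x batch-2 reference is an
# assumption for illustration, not an official rule for every config.
ref_total_batch = 8 * 2   # assumed reference total batch size
my_total_batch = 1 * 4    # e.g. one card with per-card batch size 4
base_lr = 0.00025
print(base_lr * my_total_batch / ref_total_batch)  # 6.25e-05
```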
And to go with it, here is a ConvNeXt backbone implementation with DropPath (stochastic depth), adapted for Paddle:

```python
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Code was based on https://github.com/facebookresearch/ConvNeXt

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

# To register this backbone with PaddleDetection, restore the ppdet imports
# stripped here for standalone use:
# from ppdet.core.workspace import register, serializable
# from ppdet.modeling.shape_spec import ShapeSpec

# __all__ = ['ConvNeXt']

trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02)
zeros_ = nn.initializer.Constant(value=0.0)
ones_ = nn.initializer.Constant(value=1.0)


class Identity(nn.Layer):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x


def drop_path(x, drop_prob=0.0, training=False):
    """Drop entire residual paths per sample (stochastic depth)."""
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob)
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize
    output = x.divide(keep_prob) * random_tensor
    return output


class DropPath(nn.Layer):
    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Block(nn.Layer):
    """ ConvNeXt Block. There are two equivalent implementations:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        self.dwconv = nn.Conv2D(
            dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv
        self.norm = LayerNorm(dim, epsilon=1e-6)
        self.pwconv1 = nn.Linear(
            dim, 4 * dim)  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.gamma = paddle.create_parameter(
            shape=[dim],
            dtype='float32',
            default_initializer=nn.initializer.Constant(
                value=layer_scale_init_value)
        ) if layer_scale_init_value > 0 else None
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()

    def forward(self, x):
        input = x
        x = self.dwconv(x)
        x = x.transpose([0, 2, 3, 1])  # (N, C, H, W) -> (N, H, W, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.transpose([0, 3, 1, 2])  # (N, H, W, C) -> (N, C, H, W)
        x = input + self.drop_path(x)
        return x


class LayerNorm(nn.Layer):
    """ LayerNorm that supports two data formats: channels_last (default) or channels_first.
    channels_last corresponds to inputs of shape (batch_size, height, width, channels);
    channels_first corresponds to inputs of shape (batch_size, channels, height, width).
    """

    def __init__(self,
                 normalized_shape,
                 epsilon=1e-6,
                 data_format="channels_last"):
        super().__init__()
        self.weight = paddle.create_parameter(shape=[normalized_shape],
                                              dtype='float32',
                                              default_initializer=ones_)
        self.bias = paddle.create_parameter(shape=[normalized_shape],
                                            dtype='float32',
                                            default_initializer=zeros_)
        self.epsilon = epsilon
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight,
                                self.bias, self.epsilon)
        elif self.data_format == "channels_first":
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / paddle.sqrt(s + self.epsilon)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
            return x


# @register
# @serializable
class ConvNeXt(nn.Layer):
    """ ConvNeXt
    A Paddle impl of : `A ConvNet for the 2020s` -
      https://arxiv.org/pdf/2201.03545.pdf
    Args:
        in_chans (int): Number of input image channels. Default: 3
        out_channals (list(int)): Indices of the stages whose features are returned. Default: [1, 2, 3]
        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
    """

    def __init__(self,
                 in_chans=3,
                 out_channals=[1, 2, 3],
                 depths=[3, 3, 9, 3],
                 dims=[96, 192, 384, 768],
                 drop_path_rate=0.,
                 layer_scale_init_value=1e-6, ):
        super().__init__()
        self._out_strides = [4, 8, 16, 32]
        self.dims = dims
        self.out_channals = out_channals
        self.downsample_layers = nn.LayerList(
        )  # stem and 3 intermediate downsampling conv layers
        stem = nn.Sequential(
            nn.Conv2D(in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(dims[0], epsilon=1e-6, data_format="channels_first"))
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                LayerNorm(dims[i], epsilon=1e-6, data_format="channels_first"),
                nn.Conv2D(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)
        self.stages = nn.LayerList(
        )  # 4 feature resolution stages, each consisting of multiple residual blocks
        dp_rates = [
            x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths))
        ]
        cur = 0
        for i in range(4):
            stage = nn.Sequential(*[
                Block(dim=dims[i],
                      drop_path=dp_rates[cur + j],
                      layer_scale_init_value=layer_scale_init_value)
                for j in range(depths[i])
            ])
            self.stages.append(stage)
            cur += depths[i]
        self.norm = nn.LayerNorm(dims[-1], epsilon=1e-6)  # final norm layer
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, (nn.Conv2D, nn.Linear)):
            trunc_normal_(m.weight)
            zeros_(m.bias)

    # out_shape is only needed when registering the backbone with ppdet;
    # it depends on ShapeSpec from ppdet.modeling.shape_spec (see imports above).
    # @property
    # def out_shape(self):
    #     return [
    #         ShapeSpec(channels=self.dims[i], stride=self._out_strides[i])
    #         for i in self.out_channals
    #     ]

    def forward(self, x):
        outs = []
        for i in range(4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
            if i in self.out_channals:
                outs.append(x)
        return outs


if __name__ == "__main__":
    model = ConvNeXt()
    paddle.summary(model, (1, 3, 640, 640))
```
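One usage note on the block above: drop_path_rate defaults to 0, so stochastic depth is inactive until you pass a positive value; the constructor then spreads the per-block rates linearly from 0 up to drop_path_rate via paddle.linspace, so deeper blocks are dropped more often. For example:

```python
# Enable stochastic depth: per-block drop rates ramp linearly from 0 to 0.2
model = ConvNeXt(drop_path_rate=0.2)
paddle.summary(model, (1, 3, 640, 640))
```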
That's all for the second installment on the guide-dog competition: datasets and training strategies.