How to write training loop for object detection using rastervision
and pytorch-lightning
#1769
-
I am attempting to use rastervision together with pytorch-lightning to train an object detection model. Here is my model definition:

class ObjectDetection(pl.LightningModule):
def __init__(self, backbone, lr=1e-4):
    """Wrap the detection model and remember the learning rate.

    Args:
        backbone: Detection model; stored wrapped in ``TorchVisionODAdapter``.
        lr: Learning rate kept on the instance as ``self.lr``.
    """
    super().__init__()
    self.lr = lr
    self.backbone = TorchVisionODAdapter(backbone)
def forward(self, img):
    """Call the wrapped backbone on ``img`` and return whatever it yields."""
    output = self.backbone(img)
    return output
def training_step(self, batch, batch_idx):
    """Compute and log the training loss for a single batch.

    Args:
        batch: Pair ``(image, target)``; both are passed to the backbone.
        batch_idx: Position of the batch; not used here.

    Returns:
        The sum of all values in the loss dict returned by the backbone.
    """
    print("sanity training")
    image, target = batch
    loss_dict = self.backbone(image, target)
    # Summing the dict values directly is equivalent to iterating them.
    total = sum(loss_dict.values())
    n_items = len(batch[0])
    # Log the individual loss terms first, then the aggregate.
    self.log_dict(loss_dict, batch_size=n_items)
    self.log("train_loss", total, batch_size=n_items)
    return total
def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss for a single batch.

    Args:
        batch: Pair ``(image, target)``; both are passed to the backbone.
        batch_idx: Position of the batch; not used here.

    Returns:
        The sum of all values in the dict returned by the backbone.
    """
    print("sanity validation")
    image, target = batch
    # error occurs here
    # NOTE(review): a reply in this thread says torchvision detection models
    # return predicted boxes instead of losses during validation, which would
    # explain the failure at this call -- confirm.
    loss_dict = self.backbone(image, target)
    total = sum(loss_dict.values())
    n_items = len(batch[0])
    self.log_dict(loss_dict, batch_size=n_items)
    self.log("val_loss", total, batch_size=n_items)
    return total
def configure_optimizers(self):
optimizer = torch.optim.Adam(
self.backbone.parameters(), lr=self.lr)
return optimizer

And here is my training code:

def train(self):
kw = self.kw.get("train_kw", {})
lr = float(kw.get("lr", 1e-4))
epochs = kw.get("epochs", 1)
output_dir = self.output_uri
make_dir(output_dir)
fast_dev_run = False
backbone = fasterrcnn_resnet50_fpn_v2(
num_classes=len(self.cc), pretrained=True)
model = ObjectDetection(backbone, lr=lr)
tb_logger = TensorBoardLogger(save_dir=output_dir + "/tensorboard", flush_secs=10)
trainer = pl.Trainer(
accelerator='auto',
min_epochs=1,
max_epochs=epochs+1,
default_root_dir=output_dir + "/trainer",
logger=[tb_logger],
fast_dev_run=fast_dev_run,
log_every_n_steps=1,
)
train_dl, val_dl = self.build_train_val_loader()
trainer.fit(model, train_dl, val_dl)
trainer.save_checkpoint(output_dir + "/trainer/final-model.ckpt")

And here is the error message I get when I run the
But when I run the
I believe this error occurs because my prediction has many more bounding boxes than my ground truth, but that seems like a normal issue that would occur in training, so I'm not sure how to fix it. Any help is appreciated.
Beta Was this translation helpful? Give feedback.
Replies: 3 comments 7 replies
-
With the following changes I'm able to run my model, but it throws an error any time the number of ground-truth boxes differs from the number of predicted boxes (which happens every time):

def boxlist_to_tensor(self, bl):
bl = [self.backbone.boxlist_to_model_input_dict(b) for b in bl]
boxes = [b["boxes"] for b in bl]
labels = [b["labels"] for b in bl]
boxes = torch.vstack(boxes)
labels = torch.concat(labels).float()
return boxes, labels
def training_step(self, batch, batch_idx):
    """Return the box loss plus the label loss for one batch.

    Args:
        batch: Pair ``(x, y)`` of model inputs and ground-truth targets.
        batch_idx: Position of the batch; not used here.

    Returns:
        ``generalized_box_iou_loss`` on the boxes plus ``F.mse_loss`` on the
        labels, both computed between ground truth and backbone output.
    """
    print("Sanity training")
    inputs, targets = batch
    predictions = self.backbone.forward(inputs)
    # Convert predictions and ground truth with the same helper so both
    # sides go through an identical transformation.
    pred_boxes, pred_labels = self.boxlist_to_tensor(predictions)
    true_boxes, true_labels = self.boxlist_to_tensor(targets)
    box_term = generalized_box_iou_loss(true_boxes, pred_boxes)
    label_term = F.mse_loss(true_labels, pred_labels)
    return box_term + label_term
def validation_step(self, batch, batch_idx):
print("Sanity validation")
x, y = batch
y_hat = self.backbone.forward(x)
box_hat, label_hat = self.boxlist_to_tensor(y_hat)
box, label = self.boxlist_to_tensor(y)
box_loss = generalized_box_iou_loss(box, box_hat)
label_loss = F.mse_loss(label, label_hat)
return box_loss + label_loss

I get errors such as
|
Beta Was this translation helpful? Give feedback.
-
No, the 'index' in the error is referring to the class ID. Try the following:
num_classes=(len(class_config) + 1)
Torchvision OD models behave differently during validation: instead of returning losses, they return predicted boxes. This might help: https://github.com/azavea/raster-vision/blob/master/rastervision_pytorch_learner/rastervision/pytorch_learner/object_detection_learner.py#L64-L91
Beta Was this translation helpful? Give feedback.
-
You're a legend, Adeel! Appreciate the help so much. Sorry for the super long tracebacks, haha. If you're ever in Oregon, come stop by Aerotract HQ!
Beta Was this translation helpful? Give feedback.
No, the 'index' in the error is referring to the class ID. Try the following:
num_classes=(len(class_config) + 1)
Torchvision OD models behave differently during validation. Instead of returning losses, they return predicted boxes.
This might help: https://github.com/azavea/raster-vision/blob/master/rastervision_pytorch_learner/rastervision/pytorch_learner/object_detection_learner.py#L64-L91