Jul 19, 2025●14 reads

Rabbit Detection

b
Bryte Wiredu

The training partition of the rabbit dataset is pre-augmented to create 30 versions of each source image with:

Random brightness adjustment of between -25 and +25 percent
Random exposure adjustment of between -25 and +25 percent
Random cutout with 3 boxes

import torch
from torch.utils.data import DataLoader
from utils.dataset import RabbitDataset
from utils.loss import YOLOLoss
from utils.utils import plot_image_with_boxes, compute_iou, find_best_bbox, draw_bounding_box
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set random seed for consistent results. DO NOT CHANGE!
torch.manual_seed(42)

# Dataset root; every split keeps its COCO annotation file in its own sub-folder.
data_dir = Path("./Cottontail-Rabbits")
train_ann_file, valid_ann_file, test_ann_file = (
    data_dir / split / "_annotations.coco.json"
    for split in ("train", "valid", "test")
)

# Only the training split is created with apply_transform=True.
train_dataset = RabbitDataset(train_ann_file, apply_transform=True)
valid_dataset = RabbitDataset(valid_ann_file)
test_dataset = RabbitDataset(test_ann_file)

# Report the size of each split (leading blank line, then one line per split).
print(
    "",
    f"Train dataset contains: {len(train_dataset)} images",
    f"Validation dataset contains: {len(valid_dataset)} images",
    f"Test dataset contains: {len(test_dataset)} images",
    sep="\n",
)

1.1 Visualize the output of dataset

The dataset gives as output a tuple that contains (img, target):

img: A torch tensor of shape [channels, height, width], representing the image.
target: A torch tensor of shape ...

# Select an index
# (position of the sample inside valid_dataset; change it to inspect another image)
idx = 0
# Indexing the dataset yields an (image, target) pair.
image, target = valid_dataset[idx]

# Visualize the image
# plot_image_with_boxes comes from utils.utils; presumably it draws the boxes
# encoded in `target` on top of the image -- confirm against the helper.
plot_image_with_boxes(image, target)

1.2 Define dataloaders and the target device

Set batch_size and num_workers to values suitable for your PC.

batch_size affects how many images are in each batch
num_workers affects how many subprocesses are used for data loading

# Automatically sets device to cuda if cuda is available. Otherwise uses cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=128,
    num_workers=4,
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    shuffle=False,
    batch_size=128,
    num_workers=4,
)

1.3 Define training and validation loops

# Training loop
def train_epoch(loader, model, criterion, optimizer):
    """Train ``model`` for one pass over ``loader`` and return per-batch losses.

    Args:
        loader: Iterable yielding ``(images, targets)`` batches; both items
            are moved to the module-level ``device`` before use.
        model: Network to optimise; it is switched to training mode.
        criterion: Callable ``criterion(outputs, targets)`` returning a
            scalar loss tensor.
        optimizer: Optimizer holding the parameters of ``model``.

    Returns:
        list[float]: The loss of every batch, in iteration order. The values
        are NOT averaged here; callers compute the epoch mean themselves.
    """
    # Set model to training mode
    model.train()

    losses = []
    for images, targets in tqdm(loader):
        # Move the batch to the same device as the model
        images = images.to(device)
        targets = targets.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, targets)

        # Backward pass: clear stale gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() is read once per batch to get a plain Python float
        losses.append(loss.item())

    # Return the per-batch losses
    return losses

# Validation loop
def validate(loader, model, criterion):
    """Evaluate ``model`` on ``loader`` and return the mean per-batch loss.

    Args:
        loader: Sized iterable yielding ``(images, targets)`` batches.
        model: Network to evaluate; it is switched to evaluation mode.
        criterion: Callable ``criterion(outputs, targets)`` returning a
            scalar loss tensor.

    Returns:
        The sum of the batch losses divided by ``len(loader)``.
    """
    model.eval()

    running_total = 0

    # No gradients are needed while only measuring the loss.
    with torch.no_grad():
        for batch_images, batch_targets in loader:
            batch_images = batch_images.to(device)
            batch_targets = batch_targets.to(device)

            predictions = model(batch_images)
            running_total += criterion(predictions, batch_targets).item()

    # Every batch carries the same weight in the average.
    return running_total / len(loader)

1.4 Define the model

If you are not that familiar with neural networks and do not know what a forward pass means, check out this tutorial.

import torch
import torch.nn as nn
import torch.nn.functional as F
import utils.config as config

from torchvision.models import alexnet, resnet18, resnet50

class YOLOv1(nn.Module):
    """Detector made of a frozen torchvision backbone and a trainable head.

    The backbone's feature maps are pooled to 6x6 and projected to 1024
    channels, then handed to ``DetectionNet``, whose output is returned.
    """

    def __init__(self, backbone_type="alexnet"):
        """Build the backbone, the resize stage and the detection head.

        Args:
            backbone_type: One of ``"alexnet"``, ``"resnet18"`` or
                ``"resnet50"``.

        Raises:
            ValueError: If ``backbone_type`` is any other value.
        """
        super().__init__()
        ##### Do not change. 
        # Load backbone
        # out_channels is the channel count of the feature map left after the
        # classifier layers are removed below.
        if backbone_type == "alexnet":
            backbone = alexnet(weights="DEFAULT")
            out_channels = 256
        elif backbone_type == "resnet18":
            backbone = resnet18(weights="DEFAULT")
            out_channels = 512
        elif backbone_type == "resnet50":
            backbone = resnet50(weights="DEFAULT")
            out_channels = 2048
        else:
            raise ValueError('backbone_type should be alexnet, resnet18, or resnet50')

        # Freeze backbone weights
        # (done before the resize stage is attached, so only the backbone's own
        # parameters stop receiving gradients)
        backbone.requires_grad_(False)

        # Delete last two layers as they contain the classifier
        backbone = nn.Sequential(*list(backbone.children())[:-2])
        #####
        
        # Ensures that the output from any backbone is same size 
        # Resizes the output to (B, 1024, 6, 6)
        # (adaptive pooling fixes the spatial size, the 1x1 convolution fixes
        # the channel count)
        self.channels = 1024
        resize = nn.Sequential(
            nn.AdaptiveAvgPool2d((6, 6)), nn.Conv2d(out_channels, self.channels, kernel_size=1)
        )

        # put it together so you do not forget to resize anything
        self.backbone = nn.Sequential(backbone, resize)

        # Match output size
        self.detector = DetectionNet(self.channels)

    def forward(self, x):
        """Return the detection head's output for the image batch ``x``.

        ``x`` is assumed to be a (batch, channels, height, width) image
        tensor accepted by the chosen backbone -- TODO confirm against the
        dataset.
        """
        out = self.backbone(x)
        out = self.detector(out)  # Pass through detection network

        return out

class DetectionNet(nn.Module):
    """Network taking in the features from pretrained backbone and outputting the predictions.

    Three 3x3 convolutions (with ReLU) narrow the features to 128 channels
    and a final 1x1 convolution emits ``S * S * (B * 5 + C)`` channels, where
    ``S``, ``B`` and ``C`` are ``config.GRID_SIZE``, ``config.N_BBOX`` and
    ``config.N_CLASSES``.
    """

    def __init__(self, in_channels):
        """Create the detection head.

        Args:
            in_channels: Number of channels of the incoming feature map.
        """
        super().__init__()

        # Define the detection head
        S, B, C = config.GRID_SIZE, config.N_BBOX, config.N_CLASSES
        # Keep the grid size so forward() reshapes with the same value the
        # output layer was sized with, instead of a hard-coded 7.
        self.grid_size = S
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, S * S * (B * 5 + C), kernel_size=1)  # Output layer
        )

    def forward(self, x):
        """Run the head on ``x`` and reshape the result to ``(N, S, S, -1)``.

        Args:
            x: Feature tensor of shape ``(N, in_channels, H, W)``.

        Returns:
            Tensor of shape ``(N, S, S, K)`` with ``S = config.GRID_SIZE``.
        """
        x = self.conv_layers(x)
        # NOTE(review): every convolution above preserves H and W, so for an
        # H x W input the last dimension is (B*5 + C) * H * W, not B*5 + C.
        # The last dimension only equals B*5 + C for a 1x1 feature map --
        # confirm that the loss and box decoding expect this layout.
        x = x.view(x.shape[0], self.grid_size, self.grid_size, -1)
        return x

1.5 Train the model

# Build the detector and place it on the target device
# (nn.Module.to returns the module itself).
model = YOLOv1(backbone_type="alexnet").to(device)

# Loss function and optimizer
loss_fn = YOLOLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training schedule and loss history
num_epochs = 12
losses, val_losses = [], []   # per-batch training losses / per-epoch validation losses

for epoch in range(num_epochs):
    # One optimisation pass followed by one evaluation pass
    epoch_losses = train_epoch(train_loader, model, loss_fn, optimizer)
    val_loss = validate(valid_loader, model, loss_fn)

    # Epoch mean of the per-batch training losses
    train_loss = sum(epoch_losses) / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    losses.extend(epoch_losses)
    val_losses.append(val_loss)

Plot losses

# Two side-by-side panels: per-batch training loss and per-epoch validation loss.
fig, (train_ax, val_ax) = plt.subplots(1, 2, figsize=(12, 4))

train_ax.plot(losses)
train_ax.set_title("Training Loss")
train_ax.grid()

val_ax.plot(val_losses)
val_ax.set_title("Validation Loss")
val_ax.grid()

plt.show()

1.6 Test the model

# Score every test image by IoU and show prediction (default colour) against
# ground truth (blue), one panel per image.
model.eval()
with torch.no_grad():
    # Size the grid from the dataset instead of a fixed 2x5 layout, so a test
    # set with more than 10 images does not index past the last axis.
    n_cols = 5
    n_rows = max(1, -(-len(test_dataset) // n_cols))  # ceiling division
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(16, 3 * n_rows), squeeze=False)

    iou_scores = []  # Store IoU scores
    # Flatten axes so a panel can be addressed by image index
    ax = ax.flatten()

    for idx, (image, target) in enumerate(test_dataset):
        # Add a batch dimension for the model, drop it again from the output
        image_model = image.unsqueeze(0).to(device)
        pred = model(image_model).cpu().squeeze(0)
        # The same helper decodes both the prediction and the target tensor
        pred_bbox, confidence = find_best_bbox(pred, image.shape[1])
        target_bbox, _ = find_best_bbox(target, image.shape[1])
        iou = compute_iou(pred_bbox, target_bbox)
        iou_scores.append(iou)
        # Unnormalize
        # (assumes the dataset normalised with mean 0.5 and std 0.25 -- TODO confirm)
        image = image*0.25 + 0.5

        ax[idx].imshow(image.permute(1, 2, 0).numpy())
        # Draw bounding box
        draw_bounding_box(ax[idx], pred_bbox, confidence)
        draw_bounding_box(ax[idx], target_bbox, confidence=None, color="blue")
        ax[idx].set_title(f"IoU: {iou:.3f}")
        ax[idx].axis("off")

    # Blank out panels that did not receive an image
    for unused_ax in ax[len(iou_scores):]:
        unused_ax.axis("off")
    plt.show()

# `target` still refers to the last test sample visited by the loop above
print(f"Data Target: {target}")
print(f"Data Target shape: {target.shape}")

# Stand-alone detection head with 1024 input channels (the width YOLOv1 feeds it)
Dect_model = DetectionNet(in_channels=1024)

# Add up the element count of every parameter tensor
num_params = sum(map(torch.numel, Dect_model.parameters()))

print(f"Total number of parameters in DetectionNet is: {num_params}")

from torchinfo import summary

# Layer-by-layer report for a single 1024x6x6 feature map
summary(Dect_model, input_size=(1, 1024, 6, 6))

# Stand-in for the YOLOv1 backbone output:
# batch of 5, 1024 feature channels, 6x6 feature map
batch_size, in_channels, feature_map_size = 5, 1024, 6

# Random features of that shape
dummy_input = torch.randn(
    batch_size, in_channels, feature_map_size, feature_map_size
)

# Fresh detection head matching the channel count
Dect_model = DetectionNet(in_channels=in_channels)

# One forward pass is enough to read off the output shape
output = Dect_model(dummy_input)

print("Output shape:", output.shape)

# Run the model on the first test image only and print the image height
# (dimension 1 of the image tensor). Gradients are disabled, as in the other
# inference code, because nothing is backpropagated here.
with torch.no_grad():
    for idx, (image, target) in enumerate(test_dataset):
        # Add a batch dimension for the model, drop it again from the output
        image_model = image.unsqueeze(0).to(device)
        pred = model(image_model).cpu().squeeze(0)
        print(image.shape[1])
        break

import numpy as np
# Mean IoU over every scored test image
average_iou = np.mean(iou_scores)
print(f" Average IoU on Test Set: {average_iou:.3f}")

# Positions (in test-set order) of the images scoring below 0.5 IoU
poor_iou_images = [
    position for position, score in enumerate(iou_scores) if score < 0.5
]
print(f" Test images with poor IoU (< 0.5): {poor_iou_images}")

def acc_validator(model,test_dataset):
    """Return the mean IoU of ``model``'s best box over ``test_dataset``.

    Args:
        model: Detector to evaluate; it is switched to evaluation mode.
        test_dataset: Iterable of ``(image, target)`` pairs.

    Returns:
        ``np.mean`` of the per-image IoU between the best predicted box and
        the best target box.
    """
    model.eval()
    scores = []

    with torch.no_grad():
        for image, target in test_dataset:
            # The model works on batches: add a batch axis, then strip it.
            batched = image.unsqueeze(0).to(device)
            prediction = model(batched).cpu().squeeze(0)

            # Decode prediction and target with the same helper.
            predicted_box, _ = find_best_bbox(prediction, image.shape[1])
            true_box, _ = find_best_bbox(target, image.shape[1])

            scores.append(compute_iou(predicted_box, true_box))

        average_iou = np.mean(scores)
    return average_iou
            
def compare_model(backbones=("alexnet", "resnet18", "resnet50"), num_epochs=3):
    """Train one YOLOv1 per backbone and collect its losses and mean IoU.

    Each model is trained on the module-level ``train_loader`` and scored on
    the module-level ``test_dataset`` with ``acc_validator``.

    Args:
        backbones: Backbone names passed to ``YOLOv1``; the default is the
            three backbones this function originally hard-coded.
        num_epochs: Training epochs per backbone (default 3, as before).

    Returns:
        tuple: ``(model_losses, acc_model)`` where ``model_losses[i]`` is the
        list of per-batch training losses of ``backbones[i]`` and
        ``acc_model[i]`` is its mean IoU on the test set.
    """
    model_losses = []
    acc_model = []
    for model_name in backbones:
        # Instantiate the model:
        model = YOLOv1(backbone_type=model_name)
        model.to(device)

        # define loss and optimizer (fresh for every backbone)
        loss_fn = YOLOLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

        # train
        losses = []
        for epoch in range(num_epochs):
            epoch_losses = train_epoch(train_loader, model, loss_fn, optimizer)

            train_loss = sum(epoch_losses) / len(train_loader)

            print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}")
            losses.extend(epoch_losses)

        model_losses.append(losses)
        acc_model.append(acc_validator(model, test_dataset))
    return model_losses, acc_model

def plot_acc_loss(n_name,loss,acc):
    """Plot one training-loss curve per model, titled with its score.

    Args:
        n_name: Sequence of model names; one panel is drawn per name.
        loss: Sequence where ``loss[i]`` is the loss series of ``n_name[i]``.
        acc: Sequence where ``acc[i]`` is the score shown in panel ``i``'s
            title.
    """
    # Size the figure from the argument rather than a module-level list.
    # squeeze=False keeps `axes` a 2-D array, so a single model works too.
    _, axes = plt.subplots(1, len(n_name), figsize=(15, 5), squeeze=False)

    for i, ax in enumerate(axes.ravel()):
        ax.plot(loss[i], marker='o', linestyle='-', label=f"{n_name[i]} Loss")
        # NOTE(review): the title prints the score with a "%" sign; the caller
        # in this file passes mean IoU values -- confirm the intended unit.
        ax.set_title(f"{n_name[i]} - Acc: {acc[i]}%")
        ax.set_ylabel("Loss")
        ax.legend()
        ax.grid(True)

    # Adjust layout
    plt.tight_layout()
    plt.show()

# Train every backbone; `l` receives the per-model loss lists and `a` the
# per-model scores returned by compare_model().
l,a = compare_model()

# Backbone names used to label the panels, in the order the results are stored
models = ["alexnet", "resnet18", "resnet50"]
plot_acc_loss(models,l,a)