The training partition of the rabbit dataset is pre-augmented to create 30 versions of each source image with:
Random brightness adjustment of between -25 and +25 percent
Random exposure adjustment of between -25 and +25 percent
Random cutout with 3 boxes
from pathlib import Path

import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from utils.dataset import RabbitDataset
from utils.loss import YOLOLoss
from utils.utils import plot_image_with_boxes, compute_iou, find_best_bbox, draw_bounding_box

# Set random seed for consistent results. DO NOT CHANGE!
torch.manual_seed(42)

# Each partition keeps its COCO annotation file in its own sub-folder.
data_dir = Path("./Cottontail-Rabbits")
train_ann_file, valid_ann_file, test_ann_file = (
    data_dir / split / "_annotations.coco.json"
    for split in ("train", "valid", "test")
)

# Only the training partition is built with apply_transform=True.
train_dataset = RabbitDataset(train_ann_file, apply_transform=True)
valid_dataset = RabbitDataset(valid_ann_file)
test_dataset = RabbitDataset(test_ann_file)

# Report the size of every partition.
print()
print(f"Train dataset contains: {len(train_dataset)} images")
print(f"Validation dataset contains: {len(valid_dataset)} images")
print(f"Test dataset contains: {len(test_dataset)} images")
The dataset gives as output a tuple that contains (img, target)
:
img
: A torch tensor of shape [channels, height, width]
, representing the image.
target
: A torch tensor of shape ...
# Pick one validation sample by index ...
idx = 0
image, target = valid_dataset[idx]

# ... and display the image together with its annotated boxes.
plot_image_with_boxes(image, target)
Set batch_size and num_workers to values suitable for your PC.
batch_size
affects how many images are in each batch
num_workers
affects how many subprocesses are used for data loading
# Settings shared by both loaders; only the shuffling differs between them.
loader_kwargs = dict(batch_size=128, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
def _model_device(model):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(iter(model.parameters()), None)
    if first_param is None:
        return torch.device("cpu")
    return first_param.device


# Training loop
def train_epoch(loader, model, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and return the batch losses.

    Args:
        loader: Iterable yielding ``(images, targets)`` tensor batches.
        model: Network to train; it is switched to training mode.
        criterion: Callable ``criterion(outputs, targets)`` returning a
            scalar loss tensor.
        optimizer: Optimiser that updates the parameters of ``model``.

    Returns:
        A list with one float per batch, in iteration order. The values are
        NOT averaged; callers that need an epoch mean compute it themselves.
    """
    # Set model to training mode
    model.train()

    # Batches are moved to wherever the model lives, so the function does not
    # rely on a global ``device`` defined in another notebook cell.
    device = _model_device(model)

    losses = []
    for images, targets in tqdm(loader):
        # Move the batch to the model's device
        images = images.to(device)
        targets = targets.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, targets)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    # Return the individual batch losses
    return losses


# Validation loop
def validate(loader, model, criterion):
    """Evaluate ``model`` on ``loader`` and return the mean batch loss.

    Args:
        loader: Iterable yielding ``(images, targets)`` tensor batches.
        model: Network to evaluate; it is switched to evaluation mode.
        criterion: Callable ``criterion(outputs, targets)`` returning a
            scalar loss tensor.

    Returns:
        The arithmetic mean of the batch losses as a float, or NaN when
        ``loader`` yields no batches (there is nothing to average).
    """
    # Set model to validation mode
    model.eval()
    device = _model_device(model)

    batch_losses = []
    # During validation we can skip gradient computation
    with torch.no_grad():
        for images, targets in loader:
            # Move the batch to the model's device
            images = images.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs = model(images)
            batch_losses.append(criterion(outputs, targets).item())

            # Optional plotting
            # plot_image_with_boxes(images[0].cpu(), outputs[0].cpu())

    # An empty loader has no average loss; report NaN instead of dividing by 0.
    if not batch_losses:
        return float("nan")

    # Return average loss
    return sum(batch_losses) / len(batch_losses)
If you are not familiar with neural networks and do not know what a forward pass is, check out this tutorial.
import torch
import torch.nn as nn
import torch.nn.functional as F
import utils.config as config
from torchvision.models import alexnet, resnet18, resnet50


class YOLOv1(nn.Module):
    """Detector made of a frozen pretrained backbone and a trainable head."""

    def __init__(self, backbone_type="alexnet"):
        """Build the backbone, the resize adapter and the detection head.

        Args:
            backbone_type: One of ``"alexnet"``, ``"resnet18"`` or
                ``"resnet50"``.

        Raises:
            ValueError: If ``backbone_type`` is none of the supported names.
        """
        super().__init__()

        ##### Do not change.
        # Load backbone
        if backbone_type == "alexnet":
            backbone = alexnet(weights="DEFAULT")
            out_channels = 256
        elif backbone_type == "resnet18":
            backbone = resnet18(weights="DEFAULT")
            out_channels = 512
        elif backbone_type == "resnet50":
            backbone = resnet50(weights="DEFAULT")
            out_channels = 2048
        else:
            raise ValueError('backbone_type should be alexnet, resnet18, or resnet50')

        # Freeze backbone weights
        # NOTE(review): this freezes parameters only; BatchNorm running
        # statistics in the ResNet backbones presumably still update while the
        # model is in train() mode - confirm that this is intended.
        backbone.requires_grad_(False)

        # Delete last two layers as they contain the classifier
        backbone = nn.Sequential(*list(backbone.children())[:-2])
        #####

        # Ensures that the output from any backbone is same size
        # Resizes the output to (B, 1024, 6, 6)
        # The adapter is created after the freeze above, so it stays trainable.
        self.channels = 1024
        resize = nn.Sequential(
            nn.AdaptiveAvgPool2d((6, 6)),
            nn.Conv2d(out_channels, self.channels, kernel_size=1)
        )

        # put it together so you do not forget to resize anything
        self.backbone = nn.Sequential(backbone, resize)

        # Match output size
        self.detector = DetectionNet(self.channels)

    def forward(self, x):
        """Return the detection head's output for the image batch ``x``."""
        out = self.backbone(x)
        out = self.detector(out)  # Pass through detection network
        return out


class DetectionNet(nn.Module):
    """Network taking in the features from pretrained backbone and outputting the predictions."""

    def __init__(self, in_channels):
        """Create the detection head.

        Args:
            in_channels: Number of channels of the incoming feature map.
        """
        super().__init__()

        # Define the detection head
        S, B, C = config.GRID_SIZE, config.N_BBOX, config.N_CLASSES

        # Kept for the reshape in forward() so it always follows the config.
        self.grid_size = S
        self.cell_depth = B * 5 + C

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, S * S * (B * 5 + C), kernel_size=1)  # Output layer
        )

    def forward(self, x):
        """Map a feature map (N, in_channels, H, W) to (N, S, S, B*5 + C)."""
        # The convolutions keep the spatial size, so this is
        # (N, S*S*(B*5 + C), H, W).
        x = self.conv_layers(x)

        # Collapse the H x W positions. Without this step the reshape below
        # folds them into the last axis, giving H*W*(B*5 + C) values per grid
        # cell instead of B*5 + C. Averaging here, rather than adding a pooling
        # layer to conv_layers, keeps the parameter names and count unchanged.
        x = x.mean(dim=(2, 3))

        # Reshape to (N, S, S, B*5 + C)
        return x.view(x.shape[0], self.grid_size, self.grid_size, self.cell_depth)
# Build the AlexNet-backed detector and move it to the selected device.
model = YOLOv1(backbone_type="alexnet")
model.to(device)

# Loss function and optimiser used for the whole run.
loss_fn = YOLOLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train, keeping every batch loss and one validation loss per epoch.
num_epochs = 12
losses, val_losses = [], []
for epoch in range(num_epochs):
    epoch_losses = train_epoch(train_loader, model, loss_fn, optimizer)
    val_loss = validate(valid_loader, model, loss_fn)

    # Mean training loss over the batches of this epoch.
    train_loss = sum(epoch_losses) / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    losses += epoch_losses
    val_losses += [val_loss]
# Training loss (per batch) on the left, validation loss (per epoch) on the right.
plt.figure(figsize=(12, 4))
panels = ((losses, "Training Loss"), (val_losses, "Validation Loss"))
for position, (series, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(series)
    plt.title(title)
    plt.grid()
plt.show()
# Score every test image by IoU and draw the first ones with the predicted
# box and the ground-truth box (blue).
model.eval()
with torch.no_grad():
    fig, ax = plt.subplots(2, 5, figsize=(16, 6))
    iou_scores = []  # Store IoU scores

    # Flatten axes for easier indexing if multiple rows
    ax = ax.flatten()

    for idx, (image, target) in enumerate(test_dataset):  # Get the tuple
        image_model = image.unsqueeze(0).to(device)
        pred = model(image_model).cpu().squeeze(0)

        pred_bbox, confidence = find_best_bbox(pred, image.shape[1])
        target_bbox, _ = find_best_bbox(target, image.shape[1])

        iou = compute_iou(pred_bbox, target_bbox)
        iou_scores.append(iou)

        # The grid has a fixed number of panels. Images beyond that are still
        # scored, but drawing them would index past the end of ``ax``.
        if idx >= len(ax):
            continue

        # Unnormalize
        # NOTE(review): presumably undoes a mean=0.5 / std=0.25 normalisation
        # applied by the dataset - confirm against RabbitDataset.
        image = image*0.25 + 0.5
        ax[idx].imshow(image.permute(1, 2, 0).numpy())

        # Draw bounding box
        draw_bounding_box(ax[idx], pred_bbox, confidence)
        draw_bounding_box(ax[idx], target_bbox, confidence=None, color="blue")

        ax[idx].set_title(f"IoU: {iou:.3f}")
        ax[idx].axis("off")

    # Hide panels left empty when the test set has fewer images than panels.
    for unused_ax in ax[len(iou_scores):]:
        unused_ax.axis("off")

    plt.show()
# Show the most recently loaded target tensor and its shape, one per line.
print(
    f"Data Target: {target}",
    f"Data Target shape: {target.shape}",
    sep="\n",
)
# Stand-alone detection head with input channels = 1024 (from YOLOv1 backbone)
Dect_model = DetectionNet(in_channels=1024)

# Add up the element count of every parameter tensor.
param_sizes = [p.numel() for p in Dect_model.parameters()]
num_params = sum(param_sizes)
print(f"Total number of parameters in DetectionNet is: {num_params}")

from torchinfo import summary

# Layer-by-layer summary for a single 1024 x 6 x 6 feature map.
detector_input_size = (1, 1024, 6, 6)
summary(Dect_model, input_size=detector_input_size)
# Dimensions of a dummy tensor simulating the output of the YOLOv1 backbone:
# batch size, feature-map channels, feature-map height/width.
batch_size, in_channels, feature_map_size = 5, 1024, 6

# Random features with the shape the detection head receives.
dummy_shape = (batch_size, in_channels, feature_map_size, feature_map_size)
dummy_input = torch.randn(*dummy_shape)

# Fresh detection head, one forward pass, then report the output shape.
Dect_model = DetectionNet(in_channels=in_channels)
output = Dect_model(dummy_input)
print("Output shape:", output.shape)
# Run the model on the first test sample only and print image.shape[1],
# the size that is handed to find_best_bbox elsewhere in this notebook.
for idx, (image, target) in enumerate(test_dataset):
    image_model = image.unsqueeze(0)
    image_model = image_model.to(device)

    pred = model(image_model)
    pred = pred.cpu().squeeze(0)

    print(image.shape[1])
    break
import numpy as np

# Mean IoU over all scored test images.
average_iou = np.mean(iou_scores)
print(f" Average IoU on Test Set: {average_iou:.3f}")

# Collect the indices of the images whose IoU is below 0.5.
poor_iou_images = []
for i, iou in enumerate(iou_scores):
    if iou < 0.5:
        poor_iou_images.append(i)
print(f" Test images with poor IoU (< 0.5): {poor_iou_images}")
def acc_validator(model, test_dataset):
    """Return the mean IoU between predicted and ground-truth boxes.

    Args:
        model: Detector to evaluate; it is switched to evaluation mode.
        test_dataset: Iterable of ``(image, target)`` pairs.

    Returns:
        The mean of the per-image IoU scores as computed by ``np.mean``.
    """
    model.eval()
    iou_scores = []  # One IoU score per image
    with torch.no_grad():
        for image, target in test_dataset:
            # Add a batch dimension for the model, drop it again afterwards.
            image_model = image.unsqueeze(0).to(device)
            pred = model(image_model).cpu().squeeze(0)

            pred_bbox, _ = find_best_bbox(pred, image.shape[1])
            target_bbox, _ = find_best_bbox(target, image.shape[1])

            iou_scores.append(compute_iou(pred_bbox, target_bbox))

    return np.mean(iou_scores)


def compare_model(backbones=("alexnet", "resnet18", "resnet50"), num_epochs=3, lr=1e-4):
    """Train one detector per backbone and score each on the test set.

    Uses the module-level ``train_loader``, ``test_dataset`` and ``device``.

    Args:
        backbones: Backbone names handed to ``YOLOv1``.
        num_epochs: Number of training epochs per backbone.
        lr: Learning rate of the Adam optimiser.

    Returns:
        A tuple ``(model_losses, acc_model)``: for every backbone, in order,
        the list of all its batch losses and its mean test-set IoU.
    """
    model_losses = []
    acc_model = []

    for model_name in backbones:
        # Instantiate the model:
        model = YOLOv1(backbone_type=model_name)
        model.to(device)

        # define loss and optimizer
        loss_fn = YOLOLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # train
        losses = []
        for epoch in range(num_epochs):
            epoch_losses = train_epoch(train_loader, model, loss_fn, optimizer)
            train_loss = sum(epoch_losses) / len(train_loader)
            print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}")
            losses.extend(epoch_losses)

        model_losses.append(losses)
        acc_model.append(acc_validator(model, test_dataset))

    return model_losses, acc_model


def plot_acc_loss(n_name, loss, acc):
    """Plot one training-loss curve per model, titled with its mean IoU.

    Args:
        n_name: Model names, one per panel.
        loss: For every model, its sequence of batch losses.
        acc: For every model, its mean IoU (as returned by ``acc_validator``).
    """
    # The panel count follows the argument rather than a global list, and
    # squeeze=False keeps ``axes`` an array even for a single model.
    fig, axes = plt.subplots(1, len(n_name), figsize=(15, 5), squeeze=False)

    for ax, name, curve, score in zip(axes.flat, n_name, loss, acc):
        ax.plot(curve, marker='o', linestyle='-', label=f"{name} Loss")
        # The score is a mean IoU in [0, 1], so it is not shown as a percentage.
        ax.set_title(f"{name} - Mean IoU: {score:.3f}")
        ax.set_ylabel("Loss")
        ax.legend()
        ax.grid(True)

    # Adjust layout
    plt.tight_layout()
    plt.show()
# Train and score every backbone; l holds the loss curves, a the scores.
l, a = compare_model()

# Plot each backbone's loss curve next to its score.
models = ["alexnet", "resnet18", "resnet50"]
plot_acc_loss(models, l, a)