data_loader.py
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import os
import nltk
from PIL import Image
from pycocotools.coco import COCO

class CocoDataset(data.Dataset):
    """COCO Custom Dataset compatible with torch.utils.data.DataLoader."""

    def __init__(self, root, json, vocab, vocab_size, max_seq_length, transform=None):
        """Set the paths for images, captions and the vocabulary wrapper.

        Args:
            root: image directory.
            json: COCO annotation file path.
            vocab: vocabulary wrapper.
            vocab_size: number of words in the vocabulary.
            max_seq_length: fixed length to which each caption is padded or truncated.
            transform: image transformer.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns.keys())
        self.vocab = vocab
        self.vocab_size = vocab_size
        self.max_seq_length = max_seq_length
        self.transform = transform
    def __getitem__(self, index):
        """Returns one data triple (image, multi-hot word labels, caption)."""
        coco = self.coco
        vocab = self.vocab
        ann_id = self.ids[index]
        img_id = coco.anns[ann_id]['image_id']
        path = coco.loadImgs(img_id)[0]['file_name']

        image = Image.open(os.path.join(self.root, path)).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Convert the caption (string) to a fixed-length list of word ids,
        # padded (or truncated) to max_seq_length with '<end>'.
        caption = coco.anns[ann_id]['caption']
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        while len(caption) < self.max_seq_length:
            caption.append(vocab('<end>'))
        if len(caption) > self.max_seq_length:
            caption = caption[:self.max_seq_length]
        caption = torch.Tensor(caption)
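
        # NOTE (assumption): the loop below relies on the annotation ids being
        # re-indexed so that image `img_id` owns exactly the caption ids
        # img_id*5 .. img_id*5+4. MS-COCO ships five captions per image, but
        # its raw annotation files do not follow this id scheme, so a
        # preprocessed/re-indexed annotation file is expected here.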
        # Convert the words of all five captions of this image to mul_class.
        mul_caption = []
        for cur_id in range(img_id * 5, img_id * 5 + 5):
            cur_caption = coco.anns[cur_id]['caption']
            cur_tokens = nltk.tokenize.word_tokenize(str(cur_caption).lower())
            cur_caption = []
            cur_caption.append(vocab('<start>'))
            cur_caption.extend([vocab(cur_token) for cur_token in cur_tokens])
            cur_caption.append(vocab('<end>'))
            while len(cur_caption) < self.max_seq_length:
                cur_caption.append(vocab('<end>'))
            mul_caption.extend(cur_caption)
        mul_caption = torch.Tensor(mul_caption)
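        # scatter_ turns each word id into a one-hot row; summing over rows
        # yields per-word counts over the vocabulary, and count/(count + eps)
        # followed by +0.1 and int() binarises those counts, leaving a
        # multi-hot vector of shape (vocab_size,) that marks every word
        # appearing in any of the five captions.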
        mul_class = torch.zeros(mul_caption.size(0), self.vocab_size) \
            .scatter_(1, mul_caption.long().view(mul_caption.size(0), 1), 1) \
            .sum(dim=0)
        mul_class = (mul_class / (mul_class + 0.00001) + 0.1).int().float()
        return image, mul_class, caption

    def __len__(self):
        return len(self.ids)
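
# Example of indexing the dataset directly (a sketch; paths and the vocab
# object are hypothetical):
#   dataset = CocoDataset(root='data/coco/train2014',
#                         json='data/coco/annotations/captions_train2014.json',
#                         vocab=vocab, vocab_size=len(vocab),
#                         max_seq_length=20, transform=None)
#   image, mul_class, caption = dataset[0]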


def collate_fn(data):
    """Creates mini-batch tensors from a list of tuples (image, mul_class, caption).

    A custom collate_fn is kept for explicitness, although every caption is
    already padded to the same fixed length in CocoDataset.__getitem__.

    Args:
        data: list of tuples (image, mul_class, caption).
            - image: torch tensor of shape (3, 256, 256).
            - mul_class: torch tensor of shape (vocab_size,); multi-hot word labels.
            - caption: torch tensor of shape (max_seq_length,).

    Returns:
        images: torch tensor of shape (batch_size, 3, 256, 256).
        classes: torch tensor of shape (batch_size, vocab_size).
        captions: torch tensor of shape (batch_size, max_seq_length).
    """
    images, classes, captions = zip(*data)
    # Merge images (from a tuple of 3D tensors into one 4D tensor).
    images = torch.stack(images, 0)
    classes = torch.stack(classes, 0)
    captions = torch.stack(captions, 0).long()
    return images, classes, captions


def build_datasets(args, vocab):
    """Builds the training DataLoader with standard preprocessing."""
    # Image preprocessing; the normalization constants are the ImageNet
    # mean/std expected by torchvision's pretrained ResNets. args.image_size
    # must yield a fixed output size (e.g. a (height, width) pair) so that
    # images can be stacked into a batch.
    transform = transforms.Compose([
        transforms.Resize(args.image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Build the dataset and data loader; the literal 'dataset' in the
    # configured paths is substituted with the chosen dataset name.
    train_dataset = CocoDataset(root=args.train_dir.replace('dataset', args.dataset),
                                json=args.train_caption_path.replace('dataset', args.dataset),
                                vocab=vocab,
                                vocab_size=len(vocab),
                                max_seq_length=args.max_seq_length,
                                transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               collate_fn=collate_fn)
    return train_loader
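

if __name__ == '__main__':
    # Minimal smoke test: a sketch with hypothetical paths and argument
    # values; adjust them to the local setup. Importing Vocabulary ensures
    # build_vocab is importable when the pickled vocabulary is loaded.
    import argparse
    import pickle

    from build_vocab import Vocabulary  # noqa: F401

    args = argparse.Namespace(
        dataset='coco',                       # hypothetical dataset name
        train_dir='data/dataset/train2014',   # 'dataset' is replaced above
        train_caption_path='data/dataset/annotations/captions_train2014.json',
        image_size=(224, 224),                # fixed size so images stack cleanly
        max_seq_length=20,
        batch_size=4,
        num_workers=0,
    )
    with open('data/vocab.pkl', 'rb') as f:   # hypothetical vocab path
        vocab = pickle.load(f)

    loader = build_datasets(args, vocab)
    images, classes, captions = next(iter(loader))
    print(images.shape, classes.shape, captions.shape)
    # Expected: (4, 3, 224, 224), (4, vocab_size), (4, 20)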