# dataloader.py

# Copyright 2019-2020 Stanislav Pidhorskyi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import torch.utils.data
import pickle
from os import path
import dlutils
import numpy as np
import torch
import torch.tensor
import torch.utils
import torch.utils.data
import imageio
import PIL.Image as Image
import io
import random

# Module-level handle to the CPU torch device.
cpu = torch.device('cpu')


class Dataset:
    """In-memory dataset held as two parallel NumPy arrays.

    Attributes:
        x: float32 array with one sample per row (first axis).
        y: integer array with the label of each sample.

    Indexing returns ``(label, sample)`` -- the label comes first.
    """

    @staticmethod
    def list_of_pairs_to_numpy(l):
        """Convert a sequence of ``(label, sample)`` pairs into arrays.

        Args:
            l: sequence of indexable pairs; element 0 is the label and
                element 1 is the sample data.

        Returns:
            Tuple ``(samples, labels)``: samples as a float32 array and labels
            as an array of the platform default integer type.
        """
        samples = np.asarray([pair[1] for pair in l], np.float32)
        # `np.int` was only an alias of the builtin `int` and was removed in
        # NumPy 1.24; the builtin yields the same dtype on every version.
        labels = np.asarray([pair[0] for pair in l], int)
        return samples, labels

    def __init__(self, data):
        """Build the dataset from a sequence of ``(label, sample)`` pairs."""
        self.x, self.y = Dataset.list_of_pairs_to_numpy(data)

    def __getitem__(self, index):
        """Return ``(labels, samples)`` for an integer index or a slice.

        NumPy indexing handles both cases, including slices with a step
        (the step used to be silently ignored).
        """
        return self.y[index], self.x[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def shuffle(self):
        """Shuffle labels and samples in place with one shared permutation.

        Uses NumPy's global random state. The arrays are overwritten rather
        than rebound, so existing references to ``x`` / ``y`` see the new order.
        """
        permutation = np.random.permutation(self.y.shape[0])
        for arr in (self.y, self.x):
            # Fancy indexing makes a permuted copy first, so writing it back
            # into the same array is safe.
            arr[:] = arr[permutation]


def _load_pickle(file_path):
    """Unpickle and return the object stored at ``file_path``.

    NOTE(security): unpickling can execute arbitrary code; only use this on
    dataset files that come from a trusted source.
    """
    with open(file_path, 'rb') as pkl:
        return pickle.load(pkl)


def make_datasets(cfg, logger, folding_id, inliner_classes):
    """Build the training, validation and test datasets selected by ``cfg``.

    Three modes are supported, checked in this order:

    * ``cfg.DATASET.COVID``: loads the "train" and "test" pickles, shuffles
      them with the global ``random`` generator seeded to 0, holds out the
      last 10% of the training records for validation, and decodes every kept
      image. Records are indexed as ``(encoded image bytes, label)``. Training
      keeps label 0 only; validation and test keep label 0 and
      ``cfg.DATASET.OUTLIER``.
    * ``cfg.DATASET.OFFICIAL_SPLIT``: loads the "train", "valid" and "test"
      pickles; only the training records are filtered to ``inliner_classes``.
    * otherwise: fold ``folding_id`` is the test set, the first other fold is
      the validation set, and the remaining folds form the training set
      (filtered to ``inliner_classes`` and repeated
      ``cfg.DATASET.TRAIN_MUL`` times).

    Args:
        cfg: configuration object; ``cfg.DATASET.PATH`` is a ``%``-format
            template that receives the split name or the fold index.
        logger: object with an ``info`` method.
        folding_id: index of the fold used for testing (fold mode only).
        inliner_classes: collection of labels treated as inliers.

    Returns:
        Tuple ``(train_set, valid_set, test_set)`` of ``Dataset`` objects.
    """
    if cfg.DATASET.COVID:
        logger.info("COVID!!!!!!")
        random.seed(0)

        def openim(img):
            return np.asarray(Image.open(io.BytesIO(img)))

        def decode(record):
            # Keep channel 0 of the decoded image and add a leading axis,
            # giving a (label, 1 x H x W) pair.
            return record[1], openim(record[0])[None, :, :, 0]

        data_train = _load_pickle(cfg.DATASET.PATH % "train")
        data_test = _load_pickle(cfg.DATASET.PATH % "test")

        random.shuffle(data_train)
        random.shuffle(data_test)

        # The last 10% of the shuffled training records become validation.
        split_at = int(len(data_train) * 0.9)
        data_valid = data_train[split_at:]
        data_train = data_train[:split_at]

        outlier = cfg.DATASET.OUTLIER

        def keep_for_eval(record):
            return record[1] == 0 or record[1] == outlier

        data_train = [decode(r) for r in data_train if r[1] == 0]
        data_valid = [decode(r) for r in data_valid if keep_for_eval(r)]
        data_test = [decode(r) for r in data_test if keep_for_eval(r)]

        return Dataset(data_train), Dataset(data_valid), Dataset(data_test)

    if cfg.DATASET.OFFICIAL_SPLIT:
        logger.info("Using official split!!!!!!")

        data_train = _load_pickle(cfg.DATASET.PATH % "train")
        data_valid = _load_pickle(cfg.DATASET.PATH % "valid")
        data_test = _load_pickle(cfg.DATASET.PATH % "test")

        # Only training is restricted to inliers; validation and test keep
        # every class.
        data_train = [x for x in data_train if x[0] in inliner_classes]

        return Dataset(data_train), Dataset(data_valid), Dataset(data_test)

    logger.info("Using non-official, randomized  split!!!!!!")
    data_train = []
    data_valid = []

    for i in range(cfg.DATASET.FOLDS_COUNT):
        if i == folding_id:
            continue
        fold = _load_pickle(cfg.DATASET.PATH % i)
        if len(data_valid) == 0:
            # The first non-test fold serves as the validation set.
            data_valid = fold
        else:
            data_train += fold

    if cfg.DATASET.MIX_VALIDATION_AND_TRAINING:
        # Both sets become the union of the training and validation folds.
        # Build the union once so the validation fold is not appended to
        # the validation set a second time.
        mixed = list(data_train) + list(data_valid)
        data_train = mixed
        data_valid = list(mixed)

    data_train = [x for x in data_train if x[0] in inliner_classes]

    data_test = _load_pickle(cfg.DATASET.PATH % folding_id)

    data_train *= cfg.DATASET.TRAIN_MUL

    return Dataset(data_train), Dataset(data_valid), Dataset(data_test)


def make_dataloader(dataset, batch_size, device):
    """Create a ``dlutils.batch_provider`` over ``dataset``.

    Each batch is passed through a collator that turns the ``(labels, samples)``
    pair into torch tensors on ``device``.

    Args:
        dataset: object handed to ``dlutils.batch_provider`` as the data source.
        batch_size: batch size forwarded to ``dlutils.batch_provider``.
        device: torch device on which the batch tensors are created.

    Returns:
        Whatever ``dlutils.batch_provider`` returns for these arguments.
    """
    class BatchCollator(object):
        """Callable converting a ``(labels, samples)`` batch into tensors."""

        def __init__(self, device):
            self.device = device

        def __call__(self, batch):
            labels, samples = batch
            with torch.no_grad():
                # Samples are divided by 255 before conversion.
                # NOTE(review): presumably 8-bit pixel values -- confirm.
                scaled = samples / 255.0
                x = torch.tensor(
                    scaled,
                    requires_grad=True,
                    dtype=torch.float32,
                    device=self.device,
                )
                y = torch.tensor(labels, dtype=torch.int32, device=self.device)
            return y, x

    collator = BatchCollator(device)
    return dlutils.batch_provider(dataset, batch_size, collator)


def create_set_with_outlier_percentage(dataset, inliner_classes, target_percentage, concervative=True):
    """Build a dataset whose share of outliers is about ``target_percentage``.

    No sample is discarded: the under-represented group (outliers or inliers)
    is enlarged by repeating its items until the requested ratio is reached.

    Side effects: reseeds NumPy's global random generator with 0 and shuffles
    the input ``dataset`` in place.

    Args:
        dataset: ``Dataset`` to rebalance; its items are ``(label, sample)``.
        inliner_classes: collection of labels treated as inliers; every other
            label counts as an outlier.
        target_percentage: desired outlier percentage, strictly between 0
            and 100.
        concervative: must stay true; the non-conservative (down-sampling)
            strategy is disabled.

    Returns:
        A new, shuffled ``Dataset`` with the requested outlier percentage.

    Raises:
        RuntimeError: if ``concervative`` is false.
        ValueError: if ``target_percentage`` is not strictly between 0 and
            100, if ``dataset`` is empty, or if the group that must be
            enlarged has no items.
    """
    np.random.seed(0)
    dataset.shuffle()

    dataset_outlier = []
    dataset_inliner = []
    for item in dataset:
        if item[0] in inliner_classes:
            dataset_inliner.append(item)
        else:
            dataset_outlier.append(item)

    def increase_length(data_list, target_length):
        """Repeat ``data_list`` cyclically, then cut it to ``target_length``."""
        if not data_list:
            if target_length > 0:
                raise ValueError(
                    "Cannot reach the requested outlier percentage: "
                    "the group that has to be enlarged is empty")
            return []
        repeat = (target_length + len(data_list) - 1) // len(data_list)
        return (data_list * repeat)[:target_length]

    if not concervative:
        raise RuntimeError("Don't use it, too noisy! We do want to utilize all the data that we have and not throw away anything")

    # 0 and 100 would divide by zero in the ratio formulas below.
    if not 0 < target_percentage < 100:
        raise ValueError(
            "target_percentage must be strictly between 0 and 100, got %r" % (target_percentage,))

    inliner_count = len(dataset_inliner)
    outlier_count = len(dataset_outlier)
    if inliner_count + outlier_count == 0:
        raise ValueError("Cannot rebalance an empty dataset")

    current_percentage = outlier_count * 100 / (outlier_count + inliner_count)

    if current_percentage < target_percentage:
        # Not enough outliers: repeat them to match the inlier count.
        outlier_count = int(inliner_count * target_percentage / (100.0 - target_percentage))
        dataset_outlier = increase_length(dataset_outlier, outlier_count)
    else:
        # Not enough inliers: repeat them to match the outlier count.
        inliner_count = int(outlier_count * (100.0 - target_percentage) / target_percentage)
        dataset_inliner = increase_length(dataset_inliner, inliner_count)

    result = Dataset(dataset_outlier + dataset_inliner)
    result.shuffle()

    return result