# Source code for fortuna.data.loader.array_loaders

from __future__ import annotations

from typing import (
    Optional,
    Tuple,
)

import numpy as np

from fortuna.data.loader.base import (
    BaseDataLoaderABC,
    BaseInputsLoader,
    BaseTargetsLoader,
)
from fortuna.data.loader.utils import IterableData
from fortuna.typing import (
    Array,
    Batch,
    Shape,
)


class DataLoader(BaseDataLoaderABC):
    """A loader of batches of input and target arrays."""

    @property
    def num_unique_labels(self) -> Optional[int]:
        """Number of unique target labels, computed lazily and cached."""
        if self._num_unique_labels is None:
            self._num_unique_labels = len(np.unique(self.to_array_targets()))
        return self._num_unique_labels

    @classmethod
    def from_array_data(
        cls,
        data: Batch,
        batch_size: Optional[int] = None,
        shuffle: bool = False,
        prefetch: bool = False,
    ) -> DataLoader:
        """
        Build a :class:`~fortuna.data.loader.DataLoader` object from a tuple of arrays of input
        and target variables, respectively.

        Parameters
        ----------
        data: Batch
            Input and target arrays of data.
        batch_size: Optional[int]
            The batch size. If not given, the data will not be batched.
        shuffle: bool
            Whether the data loader should shuffle at every call.
        prefetch: bool
            Whether to prefetch the next batch.

        Returns
        -------
        DataLoader
            A data loader built out of the tuple of arrays.
        """
        return cls(
            iterable=IterableData.from_batch_array_data(
                data, batch_size=batch_size, shuffle=shuffle, prefetch=prefetch
            )
        )

    def to_inputs_loader(self) -> InputsLoader:
        """
        Reduce a data loader to an inputs loader.

        Returns
        -------
        InputsLoader
            The inputs loader derived from the data loader.
        """
        return InputsLoader(IterableData.data_loader_to_inputs_iterable(self))

    def to_targets_loader(self) -> TargetsLoader:
        """
        Reduce a data loader to a targets loader.

        Returns
        -------
        TargetsLoader
            The targets loader derived from the data loader.
        """
        return TargetsLoader(IterableData.data_loader_to_targets_iterable(self))

    def to_array_data(self) -> Batch:
        """
        Reduce a data loader to a tuple of input and target arrays.

        Returns
        -------
        Batch
            Tuple of input and target arrays.
        """
        inputs, targets = [], []
        for batch_inputs, batch_targets in self:
            inputs.append(batch_inputs)
            targets.append(batch_targets)
        return np.concatenate(inputs, 0), np.concatenate(targets, 0)

    def to_array_inputs(self) -> Array:
        """
        Reduce a data loader to an array of input data.

        Returns
        -------
        Array
            Array of input data.
        """
        inputs = []
        for batch_inputs, batch_targets in self:
            inputs.append(batch_inputs)
        return np.concatenate(inputs, 0)

    def to_array_targets(self) -> Array:
        """
        Reduce a data loader to an array of target data.

        Returns
        -------
        Array
            Array of target data.
        """
        targets = []
        for batch_inputs, batch_targets in self:
            targets.append(batch_targets)
        return np.concatenate(targets, 0)

    def chop(self, divisor: int) -> DataLoader:
        """
        Chop the last part of each batch of the data loader, to make sure the number of data
        points per batch divides `divisor`.

        Parameters
        ----------
        divisor : int
            Number of data points that each batch must divide.

        Returns
        -------
        DataLoader
            A data loader with chopped batches.
        """

        def fun():
            for inputs, targets in self:
                remainder = targets.shape[0] % divisor
                if remainder == 0:
                    yield inputs, targets
                elif targets.shape[0] > divisor:
                    yield inputs[:-remainder], targets[:-remainder]
                # NOTE: batches smaller than `divisor` with a non-zero remainder are dropped.

        return self.from_callable_iterable(fun)

    def split(self, n_data: int) -> Tuple[DataLoader, DataLoader]:
        """
        Split a data loader into two data loaders.

        Parameters
        ----------
        n_data: int
            Number of data points after which the data loader should be split. The first
            returned data loader will contain exactly `n_data` data points. The second one
            will contain the remaining ones.

        Returns
        -------
        Tuple[DataLoader, DataLoader]
            The two data loaders made out of the original one.
        """

        def data_loader1():
            count = 0
            for inputs, targets in self:
                if count == n_data:
                    break
                if count + inputs.shape[0] <= n_data:
                    count += inputs.shape[0]
                    yield inputs, targets
                else:
                    # Take only the head of the batch that fits within `n_data`.
                    inputs, targets = (
                        inputs[: n_data - count],
                        targets[: n_data - count],
                    )
                    count = n_data
                    yield inputs, targets

        def data_loader2():
            count = 0
            for inputs, targets in self:
                if count > n_data:
                    yield inputs, targets
                elif (count <= n_data) and (count + inputs.shape[0] > n_data):
                    # This batch straddles the split point: keep only its tail.
                    count2 = count
                    count += inputs.shape[0]
                    inputs, targets = (
                        inputs[n_data - count2 :],
                        targets[n_data - count2 :],
                    )
                    yield inputs, targets
                else:
                    count += inputs.shape[0]

        return self.from_callable_iterable(data_loader1), self.from_callable_iterable(
            data_loader2
        )

    def sample(self, seed: int, n_samples: int) -> DataLoader:
        """
        Sample from the data loader, with replacement.

        Parameters
        ----------
        seed: int
            Random seed.
        n_samples: int
            Number of samples.

        Returns
        -------
        DataLoader
            A data loader made of the sampled data points.
        """

        def fun():
            rng = np.random.default_rng(seed)
            count = 0
            # Loop over the loader repeatedly until `n_samples` points are collected;
            # each pass keeps a random subset of every batch.
            while True:
                for inputs, targets in self:
                    if count == n_samples:
                        break
                    idx = rng.choice(2, inputs.shape[0]).astype("bool")
                    inputs, targets = inputs[idx], targets[idx]
                    if count + inputs.shape[0] > n_samples:
                        inputs, targets = (
                            inputs[: n_samples - count],
                            targets[: n_samples - count],
                        )
                    count += inputs.shape[0]
                    if inputs.shape[0] > 0:
                        yield inputs, targets
                if count == n_samples:
                    break

        return self.from_callable_iterable(fun)

    @property
    def input_shape(self) -> Shape:
        """Shape of a single input data point, inferred from the first batch."""
        for inputs, targets in self:
            return inputs.shape[1:]
        raise ValueError("Cannot infer the input shape of an empty data loader.")
class InputsLoader(BaseInputsLoader):
    """A loader of batches of input arrays."""

    @classmethod
    def from_array_inputs(
        cls,
        inputs: Array,
        batch_size: Optional[int] = None,
        shuffle: bool = False,
        prefetch: bool = False,
    ) -> InputsLoader:
        """
        Build a :class:`~fortuna.data.loader.InputsLoader` object from an array of input data.

        Parameters
        ----------
        inputs: Array
            Input array of data.
        batch_size: Optional[int]
            The batch size. If not given, the inputs will not be batched.
        shuffle: bool
            Whether the inputs loader should shuffle at every call.
        prefetch: bool
            Whether to prefetch the next batch.

        Returns
        -------
        InputsLoader
            An inputs loader built out of the array of inputs.
        """
        return cls(
            iterable=IterableData.from_array_data(
                inputs, batch_size=batch_size, shuffle=shuffle, prefetch=prefetch
            )
        )

    def to_array_inputs(self) -> Array:
        """
        Reduce an inputs loader to an array of inputs.

        Returns
        -------
        Array
            Array of input data.
        """
        inputs = []
        for batch_inputs in self:
            inputs.append(batch_inputs)
        return np.concatenate(inputs, 0)

    def chop(self, divisor: int) -> InputsLoader:
        """
        Chop the last part of each batch of the inputs loader, to make sure the number of data
        points per batch divides `divisor`.

        Parameters
        ----------
        divisor : int
            Number of data points that each batch must divide.

        Returns
        -------
        InputsLoader
            An inputs loader with chopped batches.
        """

        def fun():
            for inputs in self:
                remainder = inputs.shape[0] % divisor
                if remainder == 0:
                    yield inputs
                elif inputs.shape[0] > divisor:
                    yield inputs[:-remainder]
                # NOTE: batches smaller than `divisor` with a non-zero remainder are dropped.

        return self.from_callable_iterable(fun)

    def sample(self, seed: int, n_samples: int) -> InputsLoader:
        """
        Sample from the inputs loader, with replacement.

        Parameters
        ----------
        seed: int
            Random seed.
        n_samples: int
            Number of samples.

        Returns
        -------
        InputsLoader
            An inputs loader made of the sampled inputs.
        """

        def fun():
            rng = np.random.default_rng(seed)
            count = 0
            # Loop over the loader repeatedly until `n_samples` points are collected;
            # each pass keeps a random subset of every batch.
            while True:
                for inputs in self:
                    if count == n_samples:
                        break
                    idx = rng.choice(2, inputs.shape[0]).astype("bool")
                    inputs = inputs[idx]
                    if count + inputs.shape[0] > n_samples:
                        inputs = inputs[: n_samples - count]
                    count += inputs.shape[0]
                    if inputs.shape[0] > 0:
                        yield inputs
                if count == n_samples:
                    break

        return self.from_callable_iterable(fun)

    def split(self, n_data: int) -> Tuple[InputsLoader, InputsLoader]:
        """
        Split an inputs loader into two inputs loaders.

        Parameters
        ----------
        n_data: int
            Number of data points after which the inputs loader should be split. The first
            returned inputs loader will contain exactly `n_data` inputs. The second one will
            contain the remaining ones.

        Returns
        -------
        Tuple[InputsLoader, InputsLoader]
            The two inputs loaders made out of the original one.
        """

        def inputs_loader1():
            count = 0
            for inputs in self:
                if count == n_data:
                    break
                if count + inputs.shape[0] <= n_data:
                    count += inputs.shape[0]
                    yield inputs
                else:
                    # Take only the head of the batch that fits within `n_data`.
                    inputs = inputs[: n_data - count]
                    count = n_data
                    yield inputs

        def inputs_loader2():
            count = 0
            for inputs in self:
                if count > n_data:
                    yield inputs
                elif (count <= n_data) and (count + inputs.shape[0] > n_data):
                    # This batch straddles the split point: keep only its tail.
                    count2 = count
                    count += inputs.shape[0]
                    inputs = inputs[n_data - count2 :]
                    yield inputs
                else:
                    count += inputs.shape[0]

        return self.from_callable_iterable(inputs_loader1), self.from_callable_iterable(
            inputs_loader2
        )
class TargetsLoader(BaseTargetsLoader):
    """A loader of batches of target arrays."""

    @classmethod
    def from_array_targets(
        cls,
        targets: Array,
        batch_size: Optional[int] = None,
        shuffle: bool = False,
        prefetch: bool = False,
    ) -> TargetsLoader:
        """
        Build a :class:`~fortuna.data.loader.TargetsLoader` object from an array of target data.

        Parameters
        ----------
        targets: Array
            Target array of data.
        batch_size: Optional[int]
            The batch size. If not given, the targets will not be batched.
        shuffle: bool
            Whether the target loader should shuffle at every call.
        prefetch: bool
            Whether to prefetch the next batch.

        Returns
        -------
        TargetsLoader
            A targets loader built out of the array of targets.
        """
        return cls(
            iterable=IterableData.from_array_data(
                targets, batch_size=batch_size, shuffle=shuffle, prefetch=prefetch
            )
        )

    def to_array_targets(self) -> Array:
        """
        Reduce a targets loader to an array of targets.

        Returns
        -------
        Array
            Array of target data.
        """
        targets = []
        for batch_targets in self:
            targets.append(batch_targets)
        return np.concatenate(targets, 0)

    def chop(self, divisor: int) -> TargetsLoader:
        """
        Chop the last part of each batch of the targets loader, to make sure the number of
        data points per batch divides `divisor`.

        Parameters
        ----------
        divisor : int
            Number of data points that each batch must divide.

        Returns
        -------
        TargetsLoader
            A targets loader with chopped batches.
        """

        def fun():
            for targets in self:
                remainder = targets.shape[0] % divisor
                if remainder == 0:
                    yield targets
                elif targets.shape[0] > divisor:
                    yield targets[:-remainder]
                # NOTE: batches smaller than `divisor` with a non-zero remainder are dropped.

        return self.from_callable_iterable(fun)

    def sample(self, seed: int, n_samples: int) -> TargetsLoader:
        """
        Sample from the targets loader, with replacement.

        Parameters
        ----------
        seed: int
            Random seed.
        n_samples: int
            Number of samples.

        Returns
        -------
        TargetsLoader
            A targets loader made of the sampled targets.
        """

        def fun():
            rng = np.random.default_rng(seed)
            count = 0
            # Loop over the loader repeatedly until `n_samples` points are collected;
            # each pass keeps a random subset of every batch.
            while True:
                for targets in self:
                    if count == n_samples:
                        break
                    idx = rng.choice(2, targets.shape[0]).astype("bool")
                    targets = targets[idx]
                    if count + targets.shape[0] > n_samples:
                        targets = targets[: n_samples - count]
                    count += targets.shape[0]
                    if targets.shape[0] > 0:
                        yield targets
                if count == n_samples:
                    break

        return self.from_callable_iterable(fun)

    def split(self, n_data: int) -> Tuple[TargetsLoader, TargetsLoader]:
        """
        Split a targets loader into two targets loaders.

        Parameters
        ----------
        n_data: int
            Number of data points after which the targets loader should be split. The first
            returned targets loader will contain exactly `n_data` targets. The second one
            will contain the remaining ones.

        Returns
        -------
        Tuple[TargetsLoader, TargetsLoader]
            The two targets loaders made out of the original one.
        """

        def targets_loader1():
            count = 0
            for targets in self:
                if count == n_data:
                    break
                if count + targets.shape[0] <= n_data:
                    count += targets.shape[0]
                    yield targets
                else:
                    # Take only the head of the batch that fits within `n_data`.
                    targets = targets[: n_data - count]
                    count = n_data
                    yield targets

        def targets_loader2():
            count = 0
            for targets in self:
                if count > n_data:
                    yield targets
                elif (count <= n_data) and (count + targets.shape[0] > n_data):
                    # This batch straddles the split point: keep only its tail.
                    count2 = count
                    count += targets.shape[0]
                    targets = targets[n_data - count2 :]
                    yield targets
                else:
                    count += targets.shape[0]

        return self.from_callable_iterable(
            targets_loader1
        ), self.from_callable_iterable(targets_loader2)