# Source code for prismatique.worker.gpu

# -*- coding: utf-8 -*-
# Copyright 2024 Matthew Fitzpatrick.
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 3.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/gpl-3.0.html>.
r"""For specifying simulation parameters related to GPU workers.

Note that the documentation in this module draws from Refs. [Pryor1]_ and
[Hinitt1]_, directly copying certain passages when convenient.
"""



#####################################
## Load libraries/packages/modules ##
#####################################

# For validating and converting objects.
import czekitout.check
import czekitout.convert

# For defining classes that support enforced validation, updatability,
# pre-serialization, and de-serialization.
import fancytypes



# For recycling helper functions and/or constants.
import prismatique.worker.cpu



##################################
## Define classes and functions ##
##################################

# Declare the public API of this module explicitly.
__all__ = ["Params"]



def _check_and_convert_num_gpus(params):
    # Validate and coerce the ``num_gpus`` parameter to a non-negative `int`.
    name = "num_gpus"
    num_gpus = czekitout.convert.to_nonnegative_int(obj=params[name],
                                                    obj_name=name)

    return num_gpus



def _pre_serialize_num_gpus(num_gpus):
    # A plain `int` is already JSON-serializable; pass it through unchanged.
    return num_gpus



def _de_pre_serialize_num_gpus(serializable_rep):
    # The serializable representation is the value itself; no conversion
    # is required.
    return serializable_rep



def _check_and_convert_batch_size(params):
    # ``batch_size`` has the same validation rules for CPU and GPU workers,
    # so delegate to the CPU-worker module.
    cpu_worker_module = prismatique.worker.cpu

    return cpu_worker_module._check_and_convert_batch_size(params)



def _pre_serialize_batch_size(batch_size):
    # Reuse the pre-serialization logic defined for CPU workers.
    cpu_worker_module = prismatique.worker.cpu

    return cpu_worker_module._pre_serialize_batch_size(batch_size)



def _de_pre_serialize_batch_size(serializable_rep):
    # Reuse the de-pre-serialization logic defined for CPU workers.
    cpu_worker_module = prismatique.worker.cpu

    return cpu_worker_module._de_pre_serialize_batch_size(serializable_rep)



def _check_and_convert_data_transfer_mode(params):
    # Coerce ``data_transfer_mode`` to a `str`, then reject anything outside
    # the set of modes recognized by ``prismatic``.
    name = "data_transfer_mode"
    candidate = czekitout.convert.to_str_from_str_like(obj=params[name],
                                                       obj_name=name)

    accepted = ("single-transfer", "streaming", "auto")
    czekitout.check.if_one_of_any_accepted_strings(obj=candidate,
                                                   obj_name=name,
                                                   accepted_strings=accepted)

    return candidate



def _pre_serialize_data_transfer_mode(data_transfer_mode):
    # A plain `str` is already JSON-serializable; pass it through unchanged.
    return data_transfer_mode



def _de_pre_serialize_data_transfer_mode(serializable_rep):
    # The serializable representation is the value itself; no conversion
    # is required.
    return serializable_rep



def _check_and_convert_num_streams_per_gpu(params):
    # Validate and coerce ``num_streams_per_gpu`` to a positive `int`.
    name = "num_streams_per_gpu"
    num_streams_per_gpu = czekitout.convert.to_positive_int(obj=params[name],
                                                            obj_name=name)

    return num_streams_per_gpu



def _pre_serialize_num_streams_per_gpu(num_streams_per_gpu):
    # A plain `int` is already JSON-serializable; pass it through unchanged.
    return num_streams_per_gpu



def _de_pre_serialize_num_streams_per_gpu(serializable_rep):
    # The serializable representation is the value itself; no conversion
    # is required.
    return serializable_rep



# Default constructor parameter values for :class:`Params` (and for the
# ``gpu_params`` helpers defined further below).
_default_num_gpus = 4
_default_batch_size = 1
_default_data_transfer_mode = "auto"
_default_num_streams_per_gpu = 3
_default_skip_validation_and_conversion = False



class Params(fancytypes.PreSerializableAndUpdatable):
    r"""The simulation parameters related to GPU workers.

    Parameters
    ----------
    num_gpus : `int`, optional
        Together with the number of GPU devices actually available for the
        simulation, ``num_gpus`` determines how many GPUs are used. See the
        documentation for the class :class:`prismatique.worker.Params` for a
        discussion on how the number of GPU devices affects performance.
    batch_size : `int`, optional
        The libraries used in ``prismatic`` support batch fast-Fourier
        transforms (FFTs), whereby multiple Fourier transforms of the same
        size are computed simultaneously, allowing reuse of intermediate
        twiddle factors at the expense of a larger block of memory. This
        batch FFT method is used to transmit a batch of probes or plane
        waves through the sample simultaneously. If ``num_gpus`` has been
        set to a positive integer, then ``batch_size`` specifies the number
        of probes or plane waves to transmit simultaneously per GPU device.
        If ``num_gpus`` has been set to ``0``, then ``batch_size`` is
        ignored upon configuring the simulation.
    data_transfer_mode : ``"single-transfer"`` | ``"streaming"`` | ``"auto"``, optional
        In single-transfer mode, large data structures such as the projected
        potential array or the compact scattering matrices are transferred
        to each GPU only once, which requires that they fit into the limited
        GPU memory. In streaming mode, buffers are allocated to each CUDA
        stream large enough to hold only the relevant subset of the data for
        the current step of the calculation, with asynchronous memory copies
        of one stream overlapping the kernel execution of another. If set to
        the default of ``"auto"``, ``prismatic`` estimates how much device
        memory will be consumed and selects streaming mode whenever that
        estimate is too large compared with the available device memory. If
        ``num_gpus`` has been set to ``0``, then ``data_transfer_mode`` is
        ignored upon configuring the simulation.
    num_streams_per_gpu : `int`, optional
        If ``num_gpus`` has been set to a positive integer and streaming
        mode has been enabled, then ``num_streams_per_gpu`` specifies the
        number of CUDA streams per GPU device. Otherwise,
        ``num_streams_per_gpu`` is ignored upon configuring the simulation.
    skip_validation_and_conversion : `bool`, optional
        If set to ``False``, then each core attribute is obtained by
        applying the corresponding validation and conversion function to the
        remaining constructor parameters. If set to ``True``, the remaining
        constructor parameters are mapped to the core attributes via a
        shallow copy instead, which avoids potentially expensive deep copies
        and/or conversions. See the attributes
        :attr:`~fancytypes.Checkable.validation_and_conversion_funcs` and
        :attr:`~fancytypes.Checkable.core_attrs` for further details.

    """
    ctor_param_names = ("num_gpus",
                        "batch_size",
                        "data_transfer_mode",
                        "num_streams_per_gpu")
    kwargs = {"namespace_as_dict": globals(),
              "ctor_param_names": ctor_param_names}

    # Collect the module-level ``_check_and_convert_*``,
    # ``_pre_serialize_*``, and ``_de_pre_serialize_*`` functions defined
    # above into the class-level function maps expected by ``fancytypes``.
    _validation_and_conversion_funcs_ = \
        fancytypes.return_validation_and_conversion_funcs(**kwargs)
    _pre_serialization_funcs_ = \
        fancytypes.return_pre_serialization_funcs(**kwargs)
    _de_pre_serialization_funcs_ = \
        fancytypes.return_de_pre_serialization_funcs(**kwargs)

    # Remove the temporary helpers from the class namespace.
    del ctor_param_names, kwargs



    def __init__(self,
                 num_gpus=\
                 _default_num_gpus,
                 batch_size=\
                 _default_batch_size,
                 data_transfer_mode=\
                 _default_data_transfer_mode,
                 num_streams_per_gpu=\
                 _default_num_streams_per_gpu,
                 skip_validation_and_conversion=\
                 _default_skip_validation_and_conversion):
        # Map every constructor parameter except ``self`` (and the implicit
        # ``__class__`` cell) to the keyword arguments of the base-class
        # constructor.
        ctor_params = {key: val
                       for key, val in locals().items()
                       if (key not in ("self", "__class__"))}
        kwargs = ctor_params
        kwargs["skip_cls_tests"] = True
        fancytypes.PreSerializableAndUpdatable.__init__(self, **kwargs)

        return None



    @classmethod
    def get_validation_and_conversion_funcs(cls):
        # Return a shallow copy so callers cannot mutate the class-level map.
        return cls._validation_and_conversion_funcs_.copy()



    @classmethod
    def get_pre_serialization_funcs(cls):
        # Return a shallow copy so callers cannot mutate the class-level map.
        return cls._pre_serialization_funcs_.copy()



    @classmethod
    def get_de_pre_serialization_funcs(cls):
        # Return a shallow copy so callers cannot mutate the class-level map.
        return cls._de_pre_serialization_funcs_.copy()
def _check_and_convert_gpu_params(params):
    # ``gpu_params`` may be ``None`` (meaning: use the default GPU-worker
    # parameters) or an instance of :class:`Params`.
    obj_name = "gpu_params"
    obj = params[obj_name]

    if obj is None:
        return Params()

    czekitout.check.if_instance_of_any_accepted_types(
        obj=obj,
        obj_name=obj_name,
        accepted_types=(Params, type(None)))

    # Rebuild a fresh ``Params`` instance from the core attributes so that
    # the returned object is validated and decoupled from the caller's copy.
    core_attrs = obj.get_core_attrs(deep_copy=False)

    return Params(**core_attrs)



def _pre_serialize_gpu_params(gpu_params):
    # Delegate to the instance's own pre-serialization machinery.
    return gpu_params.pre_serialize()



def _de_pre_serialize_gpu_params(serializable_rep):
    # Delegate to the class-level de-pre-serialization machinery.
    return Params.de_pre_serialize(serializable_rep)



_default_gpu_params = None



###########################
## Define error messages ##
###########################