#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright (C) IBM Corporation 2018
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
grid_trainer_gpu.py:
- This file contains the implementation of a worker spanning a grid of training experiments on \
a collection of CUDA devices.
- It works by loading a template yaml file, modifying the resulting dict, and dumping \
that as yaml into a temporary file. The specified :py:class:`miprometheus.workers.Trainer` is then \
executed using the temporary yaml file as the task. This grid trainer will run as many concurrent jobs as possible.
"""
__author__ = "Alexis Asseman, Younes Bouhadjar, Vincent Marois"
import shutil
import torch
from time import sleep
from functools import partial
from multiprocessing.pool import ThreadPool
from miprometheus.grid_workers.grid_trainer_cpu import GridTrainerCPU
class GridTrainerGPU(GridTrainerCPU):
    """
    Grid Worker managing several training experiments on GPUs.

    Reuses a :py:class:`miprometheus.workers.Trainer` (can specify :py:class:`miprometheus.workers.OfflineTrainer` \
    or :py:class:`miprometheus.workers.OnlineTrainer`) to start one experiment.

    Inherits from :py:class:`miprometheus.grid_workers.GridTrainerCPU` as the constructor & \
    :py:func:`GridTrainerCPU.setup_grid_experiment` are identical.
    """

    def __init__(self, name="GridTrainerGPU", use_gpu=True):
        """
        Constructor for the :py:class:`miprometheus.grid_workers.GridTrainerGPU`:

            - Calls the constructor of :py:class:`miprometheus.grid_workers.GridTrainerCPU` as it is identical.

        :param name: Name of the worker (DEFAULT: "GridTrainerGPU").
        :type name: str

        :param use_gpu: Indicates whether the worker should use GPU or not.
        :type use_gpu: bool
        """
        # Call the base constructor.
        super(GridTrainerGPU, self).__init__(name=name, use_gpu=use_gpu)

    def setup_grid_experiment(self):
        """
        Setups a specific experiment.

        - Calls the parent :py:func:`GridTrainerCPU.setup_grid_experiment` to parse arguments, \
          parse config files etc.
        - Checks the presence of CUDA-compatible devices.
        """
        super(GridTrainerGPU, self).setup_grid_experiment()

        # Check the presence of CUDA-compatible devices: GPU grid training
        # is impossible without at least one of them.
        if torch.cuda.device_count() == 0:
            self.logger.error("Cannot use GPU as there are no CUDA-compatible devices present in the system!")
            exit(-1)

    def run_grid_experiment(self):
        """
        Main function of the :py:class:`miprometheus.grid_workers.GridTrainerGPU`.

        Maps the grid experiments to CUDA devices in the limit of the maximum
        concurrent runs allowed.
        """
        try:
            # Check the presence of cuda-gpupick: when available, each child
            # run is prefixed with it so that it gets pinned to a free GPU.
            if shutil.which('cuda-gpupick') is not None:
                prefix_str = "cuda-gpupick -n1 "
            else:
                self.logger.warning("Cannot localize the 'cuda-gpupick' script, not using it.")
                prefix_str = ''

            # Check max number of child processes.
            if self.max_concurrent_runs <= 0:  # We need at least one process!
                max_processes = torch.cuda.device_count()
            else:
                # Take into account the minimum value.
                max_processes = min(torch.cuda.device_count(), self.max_concurrent_runs)
            self.logger.info('Spanning experiments using {} GPU(s) concurrently.'.format(max_processes))

            # Run in as many threads as there are GPUs available to the script.
            with ThreadPool(processes=max_processes) as pool:
                # This contains a list of `AsyncResult` objects. To check if completed and get result.
                thread_results = []

                for task in self.experiments_list:
                    func = partial(GridTrainerGPU.run_experiment, self, prefix=prefix_str)
                    thread_results.append(pool.apply_async(func, (task,)))

                    # Check every 3 seconds if there is a (supposedly) free GPU to start a task on.
                    sleep(3)
                    while [r.ready() for r in thread_results].count(False) >= max_processes:
                        sleep(3)

                # Equivalent of what would usually be called "join" for threads.
                for r in thread_results:
                    r.wait()

            self.logger.info('Grid training finished')

        except KeyboardInterrupt:
            self.logger.info('Grid training interrupted!')
def main():
    """
    Entry point function for the :py:class:`miprometheus.grid_workers.GridTrainerGPU`.
    """
    worker = GridTrainerGPU()

    # Parse args, load configuration and create all required objects.
    worker.setup_grid_experiment()

    # Launch the grid of training experiments.
    worker.run_grid_experiment()


if __name__ == '__main__':
    main()