• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright (c) Facebook, Inc. and its affiliates.
2# All rights reserved.
3#
4# This source code is licensed under the BSD-style license found in the
5# LICENSE file in the root directory of this source tree.
6
7"""
8Expiration timers are set up on the same process as the agent and
9used from your script to deal with stuck workers. When you go into
10a code-block that has the potential to get stuck you can acquire
11an expiration timer, which instructs the timer server to kill the
12process if it does not release the timer by the self-imposed expiration
13deadline.
14
15Usage::
16
17    import torchelastic.timer as timer
18    import torchelastic.agent.server as agent
19
20    def main():
21        start_method = "spawn"
22        message_queue = mp.get_context(start_method).Queue()
23        server = timer.LocalTimerServer(message_queue, max_interval=0.01)
24        server.start() # non-blocking
25
26        spec = WorkerSpec(
27                    fn=trainer_func,
28                    args=(message_queue,),
29                    ...<OTHER_PARAMS...>)
30        elastic_agent = agent.LocalElasticAgent(spec, start_method)
31        elastic_agent.run()
32
33    def trainer_func(message_queue):
34        timer.configure(timer.LocalTimerClient(message_queue))
35        with timer.expires(after=60): # 60 second expiry
36            # do some work
37
38In the example above, if ``trainer_func`` takes more than 60 seconds to
39complete, the worker process is killed and the agent retries the worker group.
40"""
41
42from .api import (  # noqa: F401
43    configure,
44    expires,
45    TimerClient,
46    TimerRequest,
47    TimerServer,
48)
49from .file_based_local_timer import (  # noqa: F401
50    FileTimerClient,
51    FileTimerRequest,
52    FileTimerServer,
53)
54from .local_timer import LocalTimerClient, LocalTimerServer  # noqa: F401
55