# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Expiration timers are set up on the same process as the agent and
used from your script to deal with stuck workers. When you go into
a code block that has the potential to get stuck, you can acquire
an expiration timer, which instructs the timer server to kill the
process if it does not release the timer by the self-imposed expiration
deadline.

Usage::

    import torchelastic.timer as timer
    import torchelastic.agent.server as agent

    def main():
        start_method = "spawn"
        message_queue = mp.get_context(start_method).Queue()
        server = timer.LocalTimerServer(message_queue, max_interval=0.01)
        server.start() # non-blocking

        spec = WorkerSpec(
            fn=trainer_func,
            args=(message_queue,),
            <OTHER_PARAMS>...,
        )
        agent = agent.LocalElasticAgent(spec, start_method)
        agent.run()

    def trainer_func(message_queue):
        timer.configure(timer.LocalTimerClient(message_queue))
        with timer.expires(after=60): # 60 second expiry
            # do some work

In the example above, if ``trainer_func`` takes more than 60 seconds to
complete, then the worker process is killed and the agent retries the worker group.
"""

from .api import (  # noqa: F401
    configure,
    expires,
    TimerClient,
    TimerRequest,
    TimerServer,
)
from .file_based_local_timer import (  # noqa: F401
    FileTimerClient,
    FileTimerRequest,
    FileTimerServer,
)
from .local_timer import LocalTimerClient, LocalTimerServer  # noqa: F401