关于子进程:如何使用asyncio和concurrent.futures.ProcessPoolExecutor终止Python中长时间运行的计算(CPU密集型任务)?

How to terminate long-running computation (CPU bound task) in Python using asyncio and concurrent.futures.ProcessPoolExecutor?

Similar Question (but answer does not work for me): How to cancel long-running subprocesses running using concurrent.futures.ProcessPoolExecutor?

与上面链接的问题及其提供的解决方案不同,在我的情况下,计算本身相当耗时(CPU密集型),并且无法拆成循环来运行,以便检查是否发生了某些事件。

以下代码的简化版本:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import asyncio
import concurrent.futures as futures
import time

class Simulator:
    """Run three bot coroutines and one monitor coroutine on an asyncio loop.

    Every bot submits one CPU-bound job to a ``ProcessPoolExecutor`` and then
    "thinks" in 15-second steps until the shared deadline. The monitor waits
    for all bots, then gives each submitted job 10 more seconds to finish.
    """

    def __init__(self):
        self._loop = None
        self._lmz_executor = None
        self._tasks = []
        # Absolute deadline on the time.monotonic() clock: 60 s after creation.
        self._max_execution_time = time.monotonic() + 60
        self._long_running_tasks = []

    def initialise(self):
        """Create the event loop and the thread/process executors."""
        # Create and install the loop explicitly. asyncio.get_event_loop() is
        # deprecated for implicitly creating a loop and fails on newer Python
        # versions when no loop is current.
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)
        self._loop.set_default_executor(
            futures.ThreadPoolExecutor(max_workers=3))

        # Run separate processes of long computation task
        self._lmz_executor = futures.ProcessPoolExecutor(max_workers=3)

    def run(self):
        """Schedule the bots and the monitor, then run the loop until stopped.

        Requires ``initialise()`` to have been called. The loop is closed on
        the way out, so a ``Simulator`` can only be run once.
        """
        # Bind the coroutines to our own loop as tasks so nothing depends on
        # which loop asyncio considers "current".
        self._tasks.extend(
            self._loop.create_task(self.bot_reasoning_loop(bot_id))
            for bot_id in (1, 2, 3)
        )

        try:
            # Gather bot reasoner tasks
            _reasoner_tasks = asyncio.gather(*self._tasks)
            # Send the reasoner tasks to main monitor task; keep a reference
            # so the monitor task stays alive while the loop runs.
            _monitor = self._loop.create_task(
                self.sample_main_loop(_reasoner_tasks))
            self._loop.run_forever()
        except KeyboardInterrupt:
            pass
        finally:
            self._loop.close()

    async def sample_main_loop(self, reasoner_tasks):
        """This is the main monitor task.

        Waits for ``reasoner_tasks`` without a timeout, then waits up to 10
        seconds for each long-running job, and finally shuts the process
        executor down and stops the loop.
        """
        try:
            await asyncio.wait_for(reasoner_tasks, None)
            for task in self._long_running_tasks:
                try:
                    await asyncio.wait_for(task, 10)
                except asyncio.TimeoutError:
                    print("Oops. Some long operation timed out.")
                    # wait_for() has already cancelled the asyncio-side
                    # future, so this is a no-op. Neither call stops the
                    # worker process, which keeps computing. Do NOT call
                    # set_result() here: the future is already done and
                    # that raises InvalidStateError.
                    task.cancel()
        finally:
            # Always release the executor and stop the loop, even when a job
            # failed; otherwise run_forever() would never return.
            # NOTE: shutdown() waits for jobs that are still running.
            self._lmz_executor.shutdown()
            self._loop.stop()
        print('And now I am done. Yay!')

    async def bot_reasoning_loop(self, bot):
        """Submit one long job for ``bot`` and sleep-loop until the deadline."""
        import math

        _exec_count = 0
        _sleepy_time = 15
        # _max_execution_time is an absolute timestamp, so derive the number
        # of iterations from the time that is actually left.
        _remaining = max(0.0, self._max_execution_time - time.monotonic())
        _max_runs = math.ceil(_remaining / _sleepy_time)

        self._long_running_tasks.append(
            self._loop.run_in_executor(
                    self._lmz_executor, really_long_process, _sleepy_time))

        while time.monotonic() < self._max_execution_time:
            print("Bot#{}: thinking for {}s. Run {}/{}".format(
                    bot, _sleepy_time, _exec_count, _max_runs))
            await asyncio.sleep(_sleepy_time)
            _exec_count += 1

        print("Bot#{} Finished Thinking".format(bot))

def really_long_process(sleepy_time,
                        base=9729379273492397293479237492734,
                        exponent=344323):
    """Stand-in for a CPU-bound job: compute ``base ** exponent`` and print it.

    Args:
        sleepy_time: Accepted for call compatibility; not used.
        base: Integer base of the power (defaults to the original constant).
        exponent: Integer exponent (defaults to the original constant).

    Returns:
        None. The value is only printed.
    """
    import sys

    print("I am a really long computation.....")
    _large_val = base ** exponent

    # Interpreters that limit int -> str conversion (4300 digits by default)
    # would raise ValueError when formatting this value. Lift the limit for
    # the print only and restore it afterwards; the setting is
    # interpreter-wide.
    _previous_limit = None
    if hasattr(sys, "set_int_max_str_digits"):
        _previous_limit = sys.get_int_max_str_digits()
        sys.set_int_max_str_digits(0)
    try:
        print("I finally computed this large value: {}".format(_large_val))
    finally:
        if _previous_limit is not None:
            sys.set_int_max_str_digits(_previous_limit)

if __name__ == "__main__":
    # Script entry point: build the simulator, set up its loop and executors,
    # then hand control to run().
    sim = Simulator()
    sim.initialise()
    sim.run()

这个想法是,有一个主要的模拟循环运行并监视三个机器人线程。 然后,这些机器人线程中的每个线程都会执行一些推理,但也会使用ProcessPoolExecutor启动一个非常长的后台进程,该进程的运行时间最终可能会超过它们自己用于推理的阈值/最大执行时间。

如您在上面的代码中看到的,发生超时时,我尝试对这些任务调用.cancel()。 但这并没有真正取消实际的计算,计算仍然在后台继续进行,并且asyncio循环直到所有长时间运行的计算完成后才会终止。

如何终止方法中如此长时间运行的CPU绑定计算?

Other similar SO questions, but not necessarily related or helpful:

  • asyncio: Is it possible to cancel a future been run by an Executor?
  • How to terminate a single async task in multiprocessing if that single async task exceeds a threshold time in Python
  • Asynchronous multiprocessing with a worker pool in Python: how to keep going after timeout?

  • How do I terminate such long running CPU-bound computations within a method?

    您尝试的方法无效,因为ProcessPoolExecutor返回的future无法取消。尽管asyncio的run_in_executor会尝试传播取消操作,但是一旦任务开始执行,Future.cancel就会直接忽略该取消请求。

    这并没有什么根本性的原因。与线程不同,进程可以被安全地终止,因此ProcessPoolExecutor.submit完全可以返回一个其cancel方法会终止相应进程的future。asyncio协程已有明确定义的取消语义,并且会自动利用这一点。不幸的是,ProcessPoolExecutor.submit返回的是常规的concurrent.futures.Future,它按最低公分母来设计,把正在运行的future视为不可触碰。

    结果,要取消在子进程中执行的任务,必须完全避开ProcessPoolExecutor并自行管理进程。面临的挑战是如何在不重新实现半个multiprocessing的情况下做到这一点。标准库提供的一个选项是为此目的(滥)用multiprocessing.Pool,因为它支持可靠地关闭工作进程。CancellablePool可以按以下方式工作:

    • 与其生成固定数量的进程,不如生成固定数量的1-worker池。
    • 从异步协程向池分配任务。如果在等待任务在另一个进程中完成时取消协程,请终止单进程池并创建一个新池。
    • 由于所有内容都是由单个asyncio线程协调的,因此不必担心竞态条件,例如意外杀死已经开始执行另一任务的进程。 (如果要在ProcessPoolExecutor中支持取消,则需要防止这种情况。)

    这是该想法的示例实现:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    import asyncio
    import multiprocessing

    class CancellablePool:
        """Run functions in worker processes that can be killed on cancellation.

        Instead of one pool with ``max_workers`` processes, this keeps
        ``max_workers`` separate ``multiprocessing.Pool(1)`` objects, so a
        single running job can be stopped by terminating its own pool.
        """

        def __init__(self, max_workers=3):
            # Pools that are idle and may be handed to the next apply() call.
            self._free = {self._new_pool() for _ in range(max_workers)}
            # Pools currently executing a job.
            self._working = set()
            # Set whenever a pool is put back into _free, to wake waiters.
            self._change = asyncio.Event()

        def _new_pool(self):
            """Create one single-process pool."""
            return multiprocessing.Pool(1)

        async def apply(self, fn, *args):
            """
            Like multiprocessing.Pool.apply_async, but:
             * is an asyncio coroutine
             * terminates the process if cancelled

            Returns the value returned by ``fn(*args)`` and re-raises any
            exception it raised. If the awaiting task is cancelled, the worker
            pool is terminated and replaced, and ``asyncio.CancelledError``
            propagates to the caller.
            """
            # Wait until some pool is idle.
            while not self._free:
                await self._change.wait()
                self._change.clear()
            pool = usable_pool = self._free.pop()
            self._working.add(pool)

            loop = asyncio.get_event_loop()
            fut = loop.create_future()

            def _settle(setter, payload):
                # Runs on the event-loop thread. Skip if the future was
                # cancelled or resolved in the meantime; setting it again
                # would raise InvalidStateError.
                if not fut.done():
                    setter(payload)

            # The pool invokes these callbacks from its own result-handling
            # thread, hence call_soon_threadsafe.
            def _on_done(obj):
                loop.call_soon_threadsafe(_settle, fut.set_result, obj)

            def _on_err(err):
                loop.call_soon_threadsafe(_settle, fut.set_exception, err)

            try:
                # Submitting inside the try guarantees the pool is returned
                # to _free even if apply_async itself fails.
                pool.apply_async(
                    fn, args, callback=_on_done, error_callback=_on_err)
                return await fut
            except asyncio.CancelledError:
                # Kill the busy worker and hand a fresh pool back instead.
                pool.terminate()
                usable_pool = self._new_pool()
                # Re-raise so the caller (and asyncio.wait_for) observe the
                # cancellation instead of a silent ``None`` result.
                raise
            finally:
                self._working.remove(pool)
                self._free.add(usable_pool)
                self._change.set()

        def shutdown(self):
            """Terminate every pool, busy or idle, and forget the idle ones."""
            for p in self._working | self._free:
                p.terminate()
            self._free.clear()

    显示取消的简约测试用例:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    def really_long_process(base=9729379273492397293479237492734,
                            exponent=344323):
        """Stand-in for a CPU-bound job: compute ``base ** exponent`` and print it.

        Args:
            base: Integer base of the power (defaults to the original constant).
            exponent: Integer exponent (defaults to the original constant).

        Returns:
            None. The value is only printed.
        """
        import sys

        print("I am a really long computation.....")
        large_val = base ** exponent

        # Interpreters that limit int -> str conversion (4300 digits by
        # default) would raise ValueError when formatting this value. Lift
        # the limit for the print only and restore it afterwards; the setting
        # is interpreter-wide.
        previous_limit = None
        if hasattr(sys, "set_int_max_str_digits"):
            previous_limit = sys.get_int_max_str_digits()
            sys.set_int_max_str_digits(0)
        try:
            print("I finally computed this large value: {}".format(large_val))
        finally:
            if previous_limit is not None:
                sys.set_int_max_str_digits(previous_limit)

    async def main():
        """Start five long jobs on a CancellablePool and time each out after 1 s.

        The tasks are awaited one after another with ``asyncio.wait_for``; a
        timeout is reported with a printed message. The pool is shut down at
        the end, whether or not an unexpected error occurred.
        """
        loop = asyncio.get_event_loop()
        pool = CancellablePool()

        try:
            tasks = [loop.create_task(pool.apply(really_long_process))
                     for _ in range(5)]
            for t in tasks:
                try:
                    await asyncio.wait_for(t, 1)
                except asyncio.TimeoutError:
                    print('task timed out and cancelled')
        finally:
            # Terminate the worker processes even if a task failed with
            # something other than a timeout.
            pool.shutdown()

    # The guard is required because the pool starts worker processes: with the
    # "spawn" start method each worker re-imports this module, and unguarded
    # top-level code would run again inside every worker.
    if __name__ == "__main__":
        # Create the loop explicitly instead of relying on
        # asyncio.get_event_loop() to create one implicitly.
        _loop = asyncio.new_event_loop()
        try:
            _loop.run_until_complete(main())
        finally:
            _loop.close()

    请注意,CPU使用率从未超过3个内核,并且在测试接近结束时开始下降,这表明进程已按预期终止。

    要将其应用于问题代码,请将self._lmz_executor设为CancellablePool的实例,然后将self._loop.run_in_executor(...)更改为self._loop.create_task(self._lmz_executor.apply(...))