用于多个参数的python multiprocessing pool.map

Python multiprocessing pool.map for multiple arguments

在Python的multiprocessing库中，是否有支持多个参数的pool.map变体？

1
2
3
4
5
6
7
8
9
10
11

import multiprocessing

text = "test"


def harvester(text, case):
    """Worker from the question: takes a constant ``text`` and one ``case`` record.

    NOTE: as posted, the concatenation result is discarded, so this
    returns ``None``.
    """
    X = case[0]
    text + str(X)


if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=6)
    case = RAW_DATASET
    # This is the call the question asks about. It does NOT work as intended:
    # harvester(text, case) is evaluated immediately and its result (None) is
    # handed to pool.map in place of a callable.
    pool.map(harvester(text, case), case, 1)
    pool.close()
    pool.join()

相关讨论

is there a variant of pool.map which support multiple arguments?

python 3.3包括pool.starmap()方法：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

#!/usr/bin/env python3
from functools import partial
from itertools import repeat
from multiprocessing import Pool, freeze_support


def func(a, b):
    """Return ``a + b``."""
    return a + b


def main():
    """Map ``func`` over ``a_args`` with a fixed second argument, three equivalent ways."""
    a_args = [1, 2, 3]
    second_arg = 1
    with Pool() as pool:
        # 1. starmap over an explicit list of argument tuples.
        L = pool.starmap(func, [(1, 1), (2, 1), (3, 1)])
        # 2. starmap over tuples built by pairing each item with the constant.
        M = pool.starmap(func, zip(a_args, repeat(second_arg)))
        # 3. plain map with the constant bound by functools.partial.
        N = pool.map(partial(func, b=second_arg), a_args)
        assert L == M == N


if __name__ == "__main__":
    freeze_support()
    main()

对于旧版本：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

#!/usr/bin/env python2
import itertools
from multiprocessing import Pool, freeze_support

# itertools.izip exists only on Python 2; the built-in zip is the lazy
# equivalent on Python 3.
izip = getattr(itertools, "izip", zip)


def func(a, b):
    """Print ``a`` and ``b`` separated by a single space."""
    # A single parenthesised argument gives the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s" % (a, b))


def func_star(a_b):
    """Convert `f([1,2])` to `f(1,2)` call."""
    return func(*a_b)


def main():
    """Map ``func`` over ``a_args``, pairing every item with ``second_arg``."""
    pool = Pool()
    a_args = [1, 2, 3]
    second_arg = 1
    pool.map(func_star, izip(a_args, itertools.repeat(second_arg)))
    # Release the worker processes once the map has completed.
    pool.close()
    pool.join()


if __name__ == "__main__":
    freeze_support()
    main()

输出

1
2
3

1 1
2 1
3 1

注意这里是如何使用itertools.izip()和itertools.repeat()的。

由于@unutbu提到的bug，您不能在python 2.6上使用functools.partial()或类似的功能，所以应该明确定义简单的包装函数func_star()。另见uptimebox建议的解决方法。

相关讨论

答案取决于Python版本和具体情况。对于较新版本的Python(自3.3起)，最通用的答案最早由J.F. Sebastian给出[1]：使用Pool.starmap方法，它接受一个由参数元组组成的序列，然后自动解包每个元组中的参数，并将它们传递给给定的函数：

1
2
3
4
5
6
7
8
9
10
11
12
13

import multiprocessing
from itertools import product


def merge_names(a, b):
    """Return the two names joined as ``'<a> & <b>'``."""
    return '{} & {}'.format(a, b)


if __name__ == '__main__':
    names = ['Brown', 'Wilson', 'Bartlett', 'Rivera', 'Molloy', 'Opie']
    with multiprocessing.Pool(processes=3) as pool:
        # product(names, repeat=2) yields every ordered pair of names;
        # starmap unpacks each pair into merge_names(a, b).
        results = pool.starmap(merge_names, product(names, repeat=2))
    print(results)

# Output: ['Brown & Brown', 'Brown & Wilson', 'Brown & Bartlett', ...

对于早期版本的Python，您需要编写一个助手函数来显式地解包参数。如果您想使用with，还需要编写一个包装器来将Pool转换为上下文管理器。(感谢Muon指出这一点。)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

import multiprocessing
from itertools import product
from contextlib import contextmanager


def merge_names(a, b):
    """Return the two names joined as ``'<a> & <b>'``."""
    return '{} & {}'.format(a, b)


def merge_names_unpack(args):
    """Call ``merge_names`` with the items of ``args`` as positional arguments."""
    return merge_names(*args)


@contextmanager
def poolcontext(*args, **kwargs):
    """Yield a ``multiprocessing.Pool`` and terminate it when the block exits.

    All arguments are forwarded to ``multiprocessing.Pool``.
    """
    pool = multiprocessing.Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        # Runs even when the ``with`` body raises, so workers are not leaked.
        pool.terminate()


if __name__ == '__main__':
    names = ['Brown', 'Wilson', 'Bartlett', 'Rivera', 'Molloy', 'Opie']
    with poolcontext(processes=3) as pool:
        results = pool.map(merge_names_unpack, product(names, repeat=2))
    print(results)

# Output: ['Brown & Brown', 'Brown & Wilson', 'Brown & Bartlett', ...

在更简单的情况下，使用固定的第二个参数，也可以使用partial，但只能在python 2.7+中使用。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

import multiprocessing
from functools import partial
from contextlib import contextmanager


@contextmanager
def poolcontext(*args, **kwargs):
    """Yield a ``multiprocessing.Pool`` and terminate it when the block exits.

    All arguments are forwarded to ``multiprocessing.Pool``.
    """
    pool = multiprocessing.Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        # Runs even when the ``with`` body raises, so workers are not leaked.
        pool.terminate()


def merge_names(a, b):
    """Return the two names joined as ``'<a> & <b>'``."""
    return '{} & {}'.format(a, b)


if __name__ == '__main__':
    names = ['Brown', 'Wilson', 'Bartlett', 'Rivera', 'Molloy', 'Opie']
    with poolcontext(processes=3) as pool:
        # partial fixes the second argument, leaving a one-argument callable
        # that pool.map can apply to each name.
        results = pool.map(partial(merge_names, b='Sons'), names)
    print(results)

# Output: ['Brown & Sons', 'Wilson & Sons', 'Bartlett & Sons', ...

[1] 本回答的大部分内容受他的回答启发，而他的回答本应被采纳。不过，既然本回答一直排在最前面，似乎最好还是为将来的读者把它改进一下。

相关讨论

我想下面会更好

1
2
3
4
5
6
7
8
9

from multiprocessing import Pool


def multi_run_wrapper(args):
    """Call ``add`` with the items of ``args`` as positional arguments."""
    return add(*args)


def add(x, y):
    """Return ``x + y``."""
    return x + y


if __name__ == "__main__":
    pool = Pool(4)
    results = pool.map(multi_run_wrapper, [(1, 2), (2, 3), (3, 4)])
    # Release the worker processes once the map has completed.
    pool.close()
    pool.join()
    # Parenthesised single argument: valid on both Python 2 and Python 3.
    print(results)

输出

[3, 5, 7]

相关讨论

在Python 3.3+中使用pool.starmap()：

1
2
3
4
5
6
7
8
9
10
11
12

from multiprocessing.dummy import Pool as ThreadPool


def write(i, x):
    """Print ``i`` and ``x`` separated by ``---``."""
    print(i, "---", x)


a = ["1", "2", "3"]
b = ["4", "5", "6"]

# Guarded so that importing this module does not start a pool.
if __name__ == "__main__":
    pool = ThreadPool(2)
    # zip(a, b) pairs the lists element-wise; starmap unpacks each pair
    # into write(i, x).
    pool.starmap(write, zip(a, b))
    pool.close()
    pool.join()

结果：

1
2
3

1 --- 4
2 --- 5
3 --- 6

如果您愿意，还可以用zip()组合更多参数：zip(a,b,c,d,e)。

如果您希望将常量值作为参数传递，则必须使用import itertools，然后使用zip(itertools.repeat(constant), a)。

相关讨论

在J.F.Sebastian Answer中了解了Itertools之后，我决定更进一步，编写一个负责并行化的parmap包，在python-2.7和python-3.2(以及以后的版本)上提供map和starmap函数，这些函数可以接受任意数量的位置参数。

安装

1	pip install parmap

如何并行化：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

import parmap
# If you want to do:
y = [myfunction(x, argument1, argument2) for x in mylist]
# In parallel:
y = parmap.map(myfunction, mylist, argument1, argument2)

# If you want to do:
z = [myfunction(x, y, argument1, argument2) for (x,y) in mylist]
# In parallel:
z = parmap.starmap(myfunction, mylist, argument1, argument2)

# If you want to do:
listx = [1, 2, 3, 4, 5, 6]
listy = [2, 3, 4, 5, 6, 7]
# Named param1 so it matches the name used in the calls below.
param1 = 3.14
param2 = 42
listz = []
for (x, y) in zip(listx, listy):
    listz.append(myfunction(x, y, param1, param2))
# In parallel:
listz = parmap.starmap(myfunction, zip(listx, listy), param1, param2)

我已经将parmap上传到pypi和github存储库。

例如，问题的答案如下：

1
2
3
4
5
6
7
8
9

import parmap


def harvester(case, text):
    """Return ``text`` followed by the string form of ``case[0]``.

    ``case`` comes first because it is the argument that varies per call;
    ``text`` is passed as a constant extra argument.
    """
    X = case[0]
    # The original evaluated this expression and discarded it.
    return text + str(X)


if __name__ == "__main__":
    case = RAW_DATASET # assuming this is an iterable
    parmap.map(harvester, case, "test", chunksize=1)

multiprocessing的一个分支称为pathos(注意：使用github上的版本)，它不需要starmap——map函数为python的map镜像API，因此map可以接受多个参数。使用pathos时，通常也可以在解释器中进行多处理，而不是卡在__main__块中。Pathos将在一些温和的更新之后发布——主要是转换为python 3.x。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

Python 2.7.5 (default, Sep 30 2013, 20:15:49)
[GCC 4.2.1 (Apple Inc. build 5566)] on darwin
Type"help","copyright","credits" or"license" for more information.
>>> def func(a,b):
... print a,b
...
>>>
>>> from pathos.multiprocessing import ProcessingPool
>>> pool = ProcessingPool(nodes=4)
>>> pool.map(func, [1,2,3], [1,1,1])
1 1
2 1
3 1
[None, None, None]
>>>
>>> # also can pickle stuff like lambdas
>>> result = pool.map(lambda x: x**2, range(10))
>>> result
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
>>>
>>> # also does asynchronous map
>>> result = pool.amap(pow, [1,2,3], [4,5,6])
>>> result.get()
[1, 32, 729]
>>>
>>> # or can return a map iterator
>>> result = pool.imap(pow, [1,2,3], [4,5,6])
>>> result
<processing.pool.IMapIterator object at 0x110c2ffd0>
>>> list(result)
[1, 32, 729]

可以使用以下两个函数，以避免为每个新函数编写包装器：

1
2
3
4
5
6
7
8
9

import itertools
from multiprocessing import Pool


def universal_worker(input_pair):
    """Call the function in ``input_pair`` with the arguments packed next to it.

    ``input_pair`` is a ``(function, args)`` pair, where ``args`` is an
    iterable of positional arguments.
    """
    function, args = input_pair
    return function(*args)


def pool_args(function, *args):
    """Build the ``(function, args)`` pairs that ``universal_worker`` consumes.

    Each item of ``args`` is an iterable holding the values for one
    parameter; they are combined element-wise with ``zip``, so the result
    stops at the shortest one.
    """
    return zip(itertools.repeat(function), zip(*args))

将函数function与参数arg_0、arg_1和arg_2列表一起使用，如下所示：

1
2
3
4

pool = Pool(n_core)
# pool_args pairs `function` with one tuple of arguments per call;
# universal_worker unpacks each pair in the worker.
list_model = pool.map(universal_worker, pool_args(function, arg_0, arg_1, arg_2))
pool.close()
pool.join()

更好的方法是使用装饰器(decorator)，而不是手工编写包装函数。特别是当您有很多函数需要映射时，装饰器可以避免为每个函数编写包装器，从而节省您的时间。通常被装饰的函数无法被pickle，但是我们可以使用functools来绕过这个问题。更多讨论见此处。

这里举个例子

1
2
3
4
5
6
7
8
9
10
11
12
13

def unpack_args(func):
    """Decorate ``func`` so it takes one packed argument.

    The wrapper passes a ``dict`` as keyword arguments and anything else
    as positional arguments.
    """
    from functools import wraps

    # wraps copies func's metadata (such as __name__) onto the wrapper.
    @wraps(func)
    def wrapper(args):
        if isinstance(args, dict):
            return func(**args)
        else:
            return func(*args)
    return wrapper


@unpack_args
def func(x, y):
    """Return ``x + y``; called with one packed argument because of the decorator."""
    return x + y

然后你可以用压缩参数映射它

1
2
3
4
5

# Worker count and the two input sequences to pair up.
np = 2
xlist = range(10)
ylist = range(10)

pool = Pool(np)
# zip packs one (x, y) tuple per call; func receives it as a single argument.
packed_args = zip(xlist, ylist)
res = pool.map(func, packed_args)
pool.close()
pool.join()

当然，您可以在python 3(>=3.3)中使用Pool.starmap，如其他答案中所述。

相关讨论

另一个简单的选择是让函数以一个元组作为参数，然后把要传入的各组参数也分别打包成元组。在处理大数据块时，这可能不理想，因为我认为它会为每个元组制作副本。

1
2
3
4
5
6
7
8
9
10
11
12

from multiprocessing import Pool


def f(args):
    """Print the four items of ``args`` and return their sum.

    Tuple parameters in the signature (``def f((a, b, c, d))``) are
    Python 2-only syntax, so the tuple is unpacked in the body instead.
    """
    a, b, c, d = args
    # Single parenthesised argument: same output on Python 2 and Python 3.
    print("%s %s %s %s" % (a, b, c, d))
    return a + b + c + d


if __name__ == '__main__':
    p = Pool(10)
    data = [(i + 0, i + 1, i + 2, i + 3) for i in range(10)]
    print(p.map(f, data))
    p.close()
    p.join()

以随机顺序给出输出：

1
2
3
4
5
6
7
8
9
10
11

0 1 2 3
1 2 3 4
2 3 4 5
3 4 5 6
4 5 6 7
5 6 7 8
7 8 9 10
6 7 8 9
8 9 10 11
9 10 11 12
[6, 10, 14, 18, 22, 26, 30, 34, 38, 42]

相关讨论

针对Python 2的一种更好的方法：

1
2
3
4
5
6

from multiprocessing import Pool


def func(args):
    """Print ``i``, ``a`` and ``b`` from a nested ``(i, (a, b))`` argument and return ``a + b``.

    The nested tuple is unpacked in the body because tuple parameters in
    the signature are Python 2-only syntax.
    """
    i, (a, b) = args
    # Single parenthesised argument: same output on Python 2 and Python 3.
    print("%s %s %s" % (i, a, b))
    return a + b


# Guarded so that importing this module does not start a pool.
if __name__ == "__main__":
    pool = Pool(3)
    print(pool.map(func, [(0, (1, 2)), (1, (2, 3)), (2, (3, 4))]))
    pool.close()
    pool.join()

2 3 3

1 2 2

0 1 1

Out[]:

[3, 5, 7]

#"如何接受多个参数"。

1
2
3
4
5
6
7
8
9
10

def f1(args):
    """Return the sum of the first three items of ``args``."""
    a, b, c = args[0], args[1], args[2]
    return a + b + c


if __name__ == "__main__":
    import multiprocessing
    pool = multiprocessing.Pool(4)

    # Each element of the outer list is the packed argument for one call.
    result1 = pool.map(f1, [[1, 2, 3]])
    print(result1)
    pool.close()
    pool.join()

另一种方法是将列表传递给一个参数例程：

1
2
3
4
5
6
7
8
9
10
11
12
13
14

import os
from multiprocessing import Pool


def task(args):
    """Print the current process id and the first two items of ``args``."""
    # Format string reproduces the spacing of the original Python 2 print
    # statement and also works on Python 3.
    print("PID = %s , arg1 = %s , arg2 = %s" % (os.getpid(), args[0], args[1]))


# Guarded so that importing this module does not start a pool.
if __name__ == "__main__":
    pool = Pool()

    pool.map(task, [
        [1, 2],
        [3, 4],
        [5, 6],
        [7, 8],
    ])
    pool.close()
    pool.join()

我们可以用自己喜欢的方法构造参数列表。

相关讨论

从python 3.4.4中，可以使用multiprocessing.get_context()获取上下文对象，以使用多个start方法：

1
2
3
4
5
6
7
8
9
10
11
12
13

import multiprocessing as mp


def foo(q, h, w):
    """Put ``h + ' ' + w`` on the queue ``q`` and also print it."""
    q.put(h + ' ' + w)
    print(h + ' ' + w)


if __name__ == '__main__':
    # Objects created from a context use that context's start method.
    ctx = mp.get_context('spawn')
    q = ctx.Queue()
    p = ctx.Process(target=foo, args=(q, 'hello', 'world'))
    p.start()
    print(q.get())
    p.join()

或者你只是简单地替换

1	pool.map(harvester(text,case),case, 1)

为：

# apply_async takes the callable and a tuple of its arguments; writing
# harvester(text, case) would call it immediately in the parent process.
pool.apply_async(harvester, (text, case))

这里有很多答案，但似乎没有一个提供可以在任何版本上工作的python2/3兼容代码。如果您希望您的代码能够正常工作，这对于任何一个Python版本都是有效的：

1
2
3
4
5
6
7
8
9
10
11

import multiprocessing
import sys

# For python 2/3 compatibility, define pool context manager
# to support the 'with' statement in Python 2
if sys.version_info[0] == 2:
    from contextlib import contextmanager

    @contextmanager
    def multiprocessing_context(*args, **kwargs):
        """Yield a ``multiprocessing.Pool`` and terminate it when the block exits."""
        pool = multiprocessing.Pool(*args, **kwargs)
        try:
            yield pool
        finally:
            # Runs even when the ``with`` body raises, so workers are not leaked.
            pool.terminate()
else:
    # On Python 3 the pool returned by Pool() is already a context manager.
    multiprocessing_context = multiprocessing.Pool

之后，您可以使用多处理常规的python3方式，不管您喜欢什么。例如：

1
2
3
4

def _function_to_run_for_each(x):
return x.lower()
with multiprocessing_context(processes=3) as pool:
results = pool.map(_function_to_run_for_each, ['Bob', 'Sue', 'Tim']) print(results)

将在python2或python3中工作。

官方文档指出它只支持一个可迭代参数。在这种情况下我喜欢使用apply_async。对于你的情况，我会这样做：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

import multiprocessing
from multiprocessing import Process, Pool, Manager

text = "test"


def harvester(text, case, q=None):
    """Return ``text`` followed by the string form of ``case[0]``.

    When a queue ``q`` is given, the result is also put on it.
    """
    X = case[0]
    res = text + str(X)
    # Compare against None explicitly rather than relying on the queue's
    # truthiness.
    if q is not None:
        q.put(res)
    return res


def block_until(q, results_queue, until_counter=0):
    """Move ``until_counter`` items from ``q`` to ``results_queue``, blocking on each ``q.get()``."""
    for _ in range(until_counter):
        results_queue.put(q.get())


if __name__ == '__main__':
    pool = Pool(processes=6)
    case = RAW_DATASET
    m = Manager()
    q = m.Queue()
    results_queue = m.Queue()  # when it completes results will reside in this queue
    # Process must receive its target and args as keywords; the first
    # positional parameter is `group`.
    blocking_process = Process(target=block_until,
                               args=(q, results_queue, len(case)))
    blocking_process.start()
    for c in case:
        # Pass the callable and an argument tuple; each task gets one item `c`.
        res = pool.apply_async(harvester, (text, c, q))
        try:
            res.get(timeout=0.1)
        except multiprocessing.TimeoutError:
            # Task still running; its result will arrive through `q`.
            pass
    pool.close()
    pool.join()
    # NOTE: this join waits until len(case) results have been put on `q`.
    blocking_process.join()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

import multiprocessing

text = "test"


def unpack(args):
    """Call ``args[0]`` with the remaining items of ``args`` as positional arguments."""
    return args[0](*args[1:])


def harvester(text, case):
    """Return ``text`` followed by the string form of ``case[0]``."""
    X = case[0]
    # The original evaluated this expression and discarded it.
    return text + str(X)


if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=6)
    case = RAW_DATASET
    # args is a list of tuples
    # with the function to execute as the first item in each tuple
    args = [(harvester, text, c) for c in case]
    # doing it this way, we can pass any function
    # and we don't need to define a wrapper for each different function
    # if we need to use more than one
    pool.map(unpack, args)
    pool.close()
    pool.join()

下面是我用来把多个参数传递给单参数函数的一个例程示例，该函数用在pool.imap的并行调用中：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28

from multiprocessing import Pool


# Wrapper of the function to map:
class makefun:
    """Hold a fixed second operand so that ``fun`` can be mapped with one argument."""

    def __init__(self, var2):
        self.var2 = var2

    def fun(self, i):
        """Return ``var1[i]`` plus the stored operand (``var1`` is the module-level list)."""
        return var1[i] + self.var2


# Couple of variables for the example:
var1 = [1, 2, 3, 5, 6, 7, 8]
var2 = [9, 10, 11, 12]

# Guarded so that importing this module does not start a pool.
if __name__ == "__main__":
    # Open the pool:
    pool = Pool(processes=2)

    # Wrapper loop
    for second in var2:
        # Obtain the function to map
        pool_fun = makefun(second).fun

        # Fork loop
        for i, value in enumerate(pool.imap(pool_fun, range(len(var1)))):
            print(var1[i], '+', second, '=', value)

    # Close the pool
    pool.close()
    pool.join()

对于python2，你可以使用这个技巧

1
2
3
4
5
6

import multiprocessing
from functools import partial


def fun(a, b):
    """Return ``a + b``."""
    return a + b


b = 233

# Guarded so that importing this module does not start a pool.
if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=6)
    # pool.map must pickle the callable it sends to the workers. A lambda
    # cannot be pickled; a partial of a module-level function can
    # (Python 2.7+).
    pool.map(partial(fun, b=b), range(1000))
    pool.close()
    pool.join()