How to assign numpy.ndarray to temporary variable under nogil loop in cython?
我正在尝试实现一个隐式推荐模型,但在为约 1100 万用户、从超过 10 万个物品中计算前 5 条推荐时,代码运行时间过长。
我借助 numpy 和少量 cython(在 jupyter notebook 中)部分解决了这个问题。但使用 numpy 排序的那几行代码仍然只用到单个核心:
%%cython -f
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: linetrace=True
# cython: binding=True
# distutils: define_macros=CYTHON_TRACE_NOGIL=1
from cython.parallel import parallel, prange
import numpy as np
from tqdm import tqdm

def test(users_items=np.random.rand(11402139//1000, 134751//100),
         int N=5, show_progress=True, int num_threads=1):
    """Compute top-N item ids per user, one user at a time (single-core sort).

    Parameters
    ----------
    users_items : 2-D array; only ``shape[0]`` (the user count) is read.
    N : number of item ids stored per user.
    show_progress : when False the tqdm progress bar is disabled.
    num_threads : accepted but not used by this version.

    Returns
    -------
    ndarray of shape (users, N), dtype ``np.intc``; each row holds the
    indices of the N highest scores, ordered by ascending score.
    """
    # Define User count and loops indexes
    cdef int users_c = users_items.shape[0], u, i
    # Predefine zero 2-D C-ordered array for recommendations
    cdef int[:,::1] users_recs = np.zeros((users_c, N), dtype=np.intc)
    for u in tqdm(range(users_c), total=users_c, disable=not show_progress):
        # numpy .dot multiplication using multiple cores
        scores = np.random.rand(134751//1000, 10).dot(np.random.rand(10))
        # numpy partial sort: pick the N largest scores, then order only those
        ids_partial = np.argpartition(scores, -N)[-N:]
        ids_top = ids_partial[np.argsort(scores[ids_partial])]
        # Fill predefined 2-D array
        for i in range(N):
            users_recs[u, i] = ids_top[i]
    return np.asarray(users_recs)

# Working example
tmp = test()
我对其进行了性能分析——np.argpartition 占用了函数 60% 的运行时间,并且只使用单个核心。我有一台 80 核的服务器,所以想把这部分并行化。因此,我对一批用户执行 .dot 运算(使用多核),并计划并行地用 numpy 排序结果(排序本身使用单核)填充预先定义的空数组,但遇到了问题标题中所说的错误:
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: linetrace=True
# cython: binding=True
# distutils: define_macros=CYTHON_TRACE_NOGIL=1
from cython.parallel import parallel, prange
import numpy as np
from tqdm import tqdm
from math import ceil
def test(int N=10, show_progress=True, int num_threads=1):
    """Attempt to fill top-N item ids per user with a parallel prange loop.

    Scores are produced in batches of ``num_threads`` users; each row of a
    batch is then supposed to be partially sorted by its own thread.

    Parameters
    ----------
    N : number of item ids stored per user.
    show_progress : when False the tqdm progress bar is disabled.
    num_threads : batch size and number of prange threads.

    Returns
    -------
    ndarray of shape (users_c, N), dtype ``np.intc``.
    """
    # Define User and Item count and loops indexes
    cdef int users_c = 11402139//1000, items_c = 134751//100, u, i, u_b
    # Predefine zero 2-D C-ordered array for recommendations
    cdef int[:,::1] users_recs = np.zeros((users_c, N), dtype=np.intc)
    # Define memoryview var; np.random.rand(...).dot(...) yields float64,
    # so the element type has to be C double, not float
    cdef double[:,::1] users_items_scores_mv
    progress = tqdm(total=users_c, disable=not show_progress)
    # For a batch of Users
    for u_b in range(5):
        # Use .dot operation which use multiple cores
        users_items_scores = np.random.rand(num_threads, 10).dot(np.random.rand(134751//100, 10).T)
        # Create memory view to 2-D array, which I'm trying to sort row wise
        users_items_scores_mv = users_items_scores
        # Here it starts, try to use numpy sorting in parallel
        for u in prange(num_threads, nogil=True, num_threads=num_threads):
            # NOTE: the two numpy calls below create Python objects inside a
            # nogil section; Cython rejects them with "Assignment of Python
            # object not allowed without gil".
            ids_partial = np.argpartition(users_items_scores_mv[u], items_c-N)[items_c-N:]
            ids_top = ids_partial[np.argsort(users_items_scores_mv[u][ids_partial])]
            # Fill predefined 2-D array; batch u_b owns rows
            # [u_b * num_threads, (u_b + 1) * num_threads)
            for i in range(N):
                users_recs[u_b * num_threads + u, i] = ids_top[i]
        progress.update(num_threads)
    progress.close()
    return np.asarray(users_recs)
并得到以下报错(完整错误信息):
Error compiling Cython file:
------------------------------------------------------------
...
        # Create memory view to 2-D array,
        # which I'm trying to sort row wise
        users_items_scores_mv = users_items_scores
        # Here it starts, try to use numpy sorting in parallel
        for u in prange(num_threads, nogil=True, num_threads=num_threads):
            ids_partial = np.argpartition(users_items_scores_mv[u], items_c-N)[items_c-N:]
            ^
------------------------------------------------------------

/datascc/enn/.cache/ipython/cython/_cython_magic_201b296cd5a34240b4c0c6ed3e58de7c.pyx:31:12: Assignment of Python object not allowed without gil
我阅读了有关内存视图(memoryview)和 malloc 内存分配的资料,但没有找到适用于我这种情况的示例。
我最终写了一个自定义 C 函数,它在 nogil 下通过 openmp 并行填充 numpy 数组。这需要用 cython 重写 numpy 的 argpartition 部分排序。算法如下(第 3–4 步可以循环执行):