关于python:如何在cython的nogil循环中将numpy.ndarray赋值给临时变量?

How to assign numpy.ndarray to temporary variable under nogil loop in cython?

我正在尝试实现一个隐式反馈推荐模型,但在为约 1100 万用户、从 13 万多个物品中计算前 5 条推荐时,代码的运行时间成了问题。

我借助 numpy 和少量 cython 技巧(在 jupyter notebook 中)部分解决了这个问题,但使用 numpy 排序的那几行代码仍然只用到单核:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
%%cython -f
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: linetrace=True
# cython: binding=True
# distutils: define_macros=CYTHON_TRACE_NOGIL=1
from cython.parallel import parallel, prange
import numpy as np
from tqdm import tqdm

def test(users_items=np.random.rand(11402139//1000, 134751//100),
         int N=5, show_progress=True, int num_threads=1):
    """Build a top-N index table from mock scores, one user row at a time.

    Only the first dimension of ``users_items`` is read (it sets the number
    of users); the scores themselves are freshly generated random values on
    every iteration. ``num_threads`` is accepted but not used by this
    single-threaded version.

    Args:
        users_items: 2-D array whose ``shape[0]`` gives the user count.
        N: how many item indices to keep per user.
        show_progress: when false, the tqdm progress bar is disabled.
        num_threads: unused here.

    Returns:
        A ``(user count, N)`` NumPy array of C ints; each row holds the
        indices of the N largest scores, ordered from lowest to highest.
    """
    cdef int n_users = users_items.shape[0]
    cdef int user, rank
    # Pre-allocated, C-contiguous output table (zero-filled)
    cdef int[:, ::1] top_ids = np.zeros((n_users, N), dtype=np.intc)

    user_iter = tqdm(range(n_users), total=n_users, disable=not show_progress)
    for user in user_iter:
        # Mock score vector: random matrix times random vector (multi-core .dot)
        mock_matrix = np.random.rand(134751//1000, 10)
        mock_vector = np.random.rand(10)
        scores = mock_matrix.dot(mock_vector)
        # Partial sort: pick the N best candidates, then order just those
        candidates = np.argpartition(scores, -N)[-N:]
        order = np.argsort(scores[candidates])
        best = candidates[order]
        # Copy element by element so each index is coerced to a C int
        for rank in range(N):
            top_ids[user, rank] = best[rank]
    return np.asarray(top_ids)
# Working example: call with all defaults (N=5, progress bar on, num_threads=1)
tmp = test()

我对其进行了性能分析:np.argpartition 占用了该函数 60% 的运行时间,并且只使用单个核心。我想让它并行运行,因为我有一台 80 核的服务器。因此,我对一批用户执行 .dot 运算(可利用多核),并打算并行地把 numpy 排序(单核)的结果填入预先定义好的空数组,但遇到了与问题标题对应的编译错误:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
%%cython -f
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: linetrace=True
# cython: binding=True
# distutils: define_macros=CYTHON_TRACE_NOGIL=1
from cython.parallel import parallel, prange
import numpy as np
from tqdm import tqdm
from math import ceil
def test(int N=10, show_progress=True, int num_threads=1):
    """Attempt to fill a top-N table by calling NumPy sorts inside a nogil prange.

    NOTE: this version is the one that fails to compile. Inside the
    ``prange(..., nogil=True)`` loop the results of NumPy calls are assigned
    to untyped (Python object) variables, which Cython rejects with
    "Assignment of Python object not allowed without gil".

    Args:
        N: number of item indices to keep per row.
        show_progress: when false, the tqdm progress bar is disabled.
        num_threads: thread count passed to ``prange``; also used as the
            number of rows in each mock score batch.

    Returns:
        The ``(users_c, N)`` C-int result table as a NumPy array.
    """
    # User count, item count, and loop indexes (user, top-N slot, batch)
    cdef int users_c = 11402139//1000, items_c = 134751//100, u, i, u_b
    # Pre-allocated, zero-filled, C-contiguous 2-D array for recommendations
    cdef int[:,::1] users_recs = np.zeros((users_c, N), dtype=np.intc)
    # Typed memoryview that will point at each batch of scores.
    # NOTE(review): `float` is C single precision, while np.random.rand
    # presumably yields float64 - the assignment below may raise a buffer
    # dtype mismatch at run time; confirm (`double` may be intended).
    cdef float[:,::1] users_items_scores_mv
    progress = tqdm(total=users_c, disable=not show_progress)
    # For a batch of Users (the number of batches is hard-coded to 5)
    for u_b in range(5):
        # Mock scores, shape (num_threads, items); .dot can use multiple cores
        users_items_scores = np.random.rand(num_threads, 10).dot(np.random.rand(134751//100, 10).T)
        # Create memory view to the 2-D array that should be sorted row-wise
        users_items_scores_mv = users_items_scores
        # Here it starts: try to use numpy sorting in parallel, one row per
        # iteration. The two assignments below create Python objects without
        # the GIL, which is what the compiler reports as an error.
        for u in prange(num_threads, nogil=True, num_threads=num_threads):
            ids_partial = np.argpartition(users_items_scores_mv[u], items_c-N)[items_c-N:]
            ids_top = ids_partial[np.argsort(users_items_scores_mv[u][ids_partial])]
            # Fill predefined 2-D array
            # NOTE(review): the row index is u_b + u, so consecutive batches
            # would overlap when num_threads > 1 - confirm whether
            # u_b * num_threads + u was intended.
            for i in range(N):
                users_recs[u_b + u, i] = ids_top[i]
        progress.update(num_threads)
    progress.close()
    return np.asarray(users_recs)

并得到如下报错(完整错误信息):

1
2
3
4
5
6
7
8
9
10
11
12
13
Error compiling Cython file:
------------------------------------------------------------
...
        # Create memory view to 2-D array,
        # which I'm trying to sort row wise
        users_items_scores_mv = users_items_scores
        # Here it starts, try to use numpy sorting in parallel
        for u in prange(num_threads, nogil=True, num_threads=num_threads):
            ids_partial = np.argpartition(users_items_scores_mv[u], items_c-N)[items_c-N:]
           ^
------------------------------------------------------------

/datascc/enn/.cache/ipython/cython/_cython_magic_201b296cd5a34240b4c0c6ed3e58de7c.pyx:31:12: Assignment of Python object not allowed without gil

我阅读了有关内存视图(memoryview)和 malloc 内存分配的资料,但没有找到适用于我这种情况的示例。


我最终写了一个自定义 C 函数,它在 nogil 下通过 openmp 并行填充 numpy 数组。这需要用 cython 重写 numpy 的 argpartition 部分排序。算法如下(其中第 3–4 步可以循环执行):

  • 定义空数组 A[i,j] 和内存视图 B_mv[i,k];其中 "i" 是批量大小,"j" 是列数,"k" 是排序后要返回的项目数
  • 在 A 上创建指针