关于进程(process):如何在Erlang中高效地实现MapReduce示例?

How to implement the MapReduce example in Erlang efficiently?

我正在尝试比较并发编程语言(例如Haskell,Go和Erlang)的性能。下面的Go代码计算平方和(重复计算R次的平方和):

1^2 + 2^2 + 3^2 + ... + 1024^2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
package main

import"fmt"

// mapper squares every value read from in and writes the square to out.
// It returns once in has been closed and drained.
func mapper(in chan int, out chan int) {
    for {
        value, open := <-in
        if !open {
            return
        }
        out <- value * value
    }
}

// reducer pairs one value from in1 with one value from in2 and writes
// their sum to out. It returns once in1 has been closed and drained.
func reducer(in1, in2 chan int, out chan int) {
    for {
        left, open := <-in1
        if !open {
            return
        }
        right := <-in2
        out <- left + right
    }
}

func main() {
    const N = 1024  // calculate sum of squares up to N; N must be power of 2
    const R = 10  // number of repetitions to fill the"pipe"

    var r [N*2]chan int
    for i := range r {r[i] = make(chan int)}
    var m [N]chan int
    for i := range m {m[i] = make(chan int)}

    for i := 0; i < N; i++ {go mapper(m[i], r[i + N])}
    for i := 1; i < N; i++ {go reducer(r[i * 2], r[i *2 + 1], r[i])}

    go func () {
        for j := 0; j < R; j++ {
            for i := 0; i < N; i++ {m[i] <- i + 1}
        }
    } ()

    for j := 0; j < R; j++ {
        <- r[1]
    }
}

以下代码是Erlang中的MapReduce解决方案。我是Erlang的新手。我想比较Go,Haskell和Erlang的性能。我的问题是如何优化此Erlang代码。我使用erlc -W mr.erl编译该代码,并使用erl -noshell -s mr start -s init stop -extra 1024 1024运行该代码。是否有用于优化的特殊编译和执行选项?我非常感谢您可以提供的任何帮助。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
-module(mr).
-export([start/0, create/2, doreduce/2, domap/1, repeat/3]).

%% @doc Command-line entry point.
%% Reads the plain arguments (those after `-extra'): the first is the number
%% of mappers N, the second is the repeat count R. Both are parsed as
%% integers and handed to create/2.
start() ->
    [SizeArg | Rest] = init:get_plain_arguments(),
    Size = list_to_integer(SizeArg),
    [RepeatArg | _Ignored] = Rest,
    create(list_to_integer(RepeatArg), Size).

%% @doc Builds the map/reduce process tree for `Num' inputs, runs it via
%% repeat/3, then shuts the workers down. Returns whatever repeat/3 returns.
%%
%% `2*Num - 1' reducer processes and `Num' mapper processes are spawned and
%% wired together by reducer_connect/3 and mapper_connect/4.
%%
%% Workers are linked to the caller: repeat/3 blocks in a `receive' with no
%% timeout, so with a bare spawn a crashed worker would leave the caller
%% waiting forever; with a link the failure propagates instead.
%%
%% After the rounds are done every worker is sent the atom `true', the
%% message on which the domap/doreduce loops return, so repeated calls
%% (e.g. from the shell) no longer leak 3*Num - 1 idle processes each.
create(R, Num) when is_integer(Num), Num > 0 ->
    Self = self(),
    Reducers = [spawn_link(?MODULE, doreduce, [Index, Self])
                || Index <- lists:seq(1, 2*Num - 1)],
    Mappers = [spawn_link(?MODULE, domap, [In]) || In <- lists:seq(1, Num)],
    reducer_connect(Num-1, Reducers, Self),
    mapper_connect(Num, Num, Reducers, Mappers),
    Result = repeat(R, Num, Mappers),
    %% A normal worker exit does not propagate over the link.
    lists:foreach(fun(Worker) -> Worker ! true end, Mappers ++ Reducers),
    Result.

%% @doc Runs exactly `R' map/reduce rounds and returns `true'.
%%
%% Each round sends one input to every mapper (send_message/2) and then
%% blocks until a `{result, Sum}' message arrives in the caller's mailbox.
%%
%% The previous version also ran a round in the `0' clause, so it performed
%% R+1 rounds; `repeat(0, ...)' now does nothing. The guard makes a negative
%% or non-integer `R' fail with function_clause instead of recursing forever.
%% The received sum is deliberately ignored (bound to `_Sum' so that
%% `erlc -W' no longer reports an unused variable).
repeat(0, _Num, _Mappers) ->
    true;
repeat(R, Num, Mappers) when is_integer(R), R > 0 ->
    send_message(Num, Mappers),
    receive
        {result, _Sum} ->
            true
    end,
    repeat(R - 1, Num, Mappers).
%% @doc Sends `{mapper, I}' to the I-th process of `Mappers' for
%% I = Num, Num-1, ..., 1 and returns the last message sent, `{mapper, 1}'.
%%
%% Only the first `Num' entries of `Mappers' are used. The list is walked
%% once; the previous version called lists:nth/2 for every index, which made
%% each round quadratic in `Num'. Fails with function_clause if `Num' is not
%% a positive integer or `Mappers' has fewer than `Num' entries.
send_message(Num, Mappers) when is_integer(Num), Num >= 1 ->
    Indexed = lists:zip(lists:seq(1, Num), lists:sublist(Mappers, Num)),
    %% foldr applies the fun to the last pair first, which keeps the original
    %% descending send order and makes the final result `{mapper, 1}'.
    lists:foldr(fun({I, Pid}, _Acc) -> Pid ! {mapper, I} end, ok, Indexed).

%% @doc Wires up the inner nodes of the reducer tree.
%%
%% `RList' is treated as a 1-indexed binary heap: for every I from `Index'
%% down to 1, reducers 2*I and 2*I+1 are sent `{connect, Parent}' where
%% Parent is reducer I. Finally reducer 1 is sent `{connect, Root}'.
%% Returns that last message, `{connect, Root}'.
%%
%% The list is converted to a tuple once so each lookup is O(1); the previous
%% version used lists:nth/2 for every lookup, which is quadratic overall.
%% `Index' must be a positive integer; anything else fails with
%% function_clause.
reducer_connect(Index, RList, Root) when is_integer(Index), Index >= 1 ->
    connect_children(Index, list_to_tuple(RList), Root).

%% Connects the two children of node `Index', then moves on to `Index - 1'.
connect_children(Index, Reducers, Root) ->
    Parent = element(Index, Reducers),
    element(Index*2, Reducers) ! {connect, Parent},
    element(Index*2 + 1, Reducers) ! {connect, Parent},
    case Index of
        1 -> Parent ! {connect, Root};
        _ -> connect_children(Index - 1, Reducers, Root)
    end.

%% @doc Connects mappers to the leaf reducers.
%%
%% For I = Index, Index-1, ..., 1 the I-th mapper in `MList' is sent
%% `{connect, R}' where R is element `Num + I - 1' of `RList'. Returns the
%% last message sent, i.e. the one for mapper 1.
%%
%% Both lists are sliced and zipped in a single pass; the previous version
%% called lists:nth/2 twice per mapper, which is quadratic overall. Fails if
%% `Index' is not a positive integer or if either list is too short.
mapper_connect(Index, Num, RList, MList) when is_integer(Index), Index > 0 ->
    Leaves = lists:sublist(RList, Num, Index),
    Maps = lists:sublist(MList, Index),
    %% foldr handles the last pair first, which keeps the original descending
    %% send order and returns the message sent to mapper 1.
    lists:foldr(fun({M, R}, _Acc) -> M ! {connect, R} end,
                ok,
                lists:zip(Maps, Leaves)).

%% @doc Entry point of a reducer process.
%% Blocks until a `{connect, Parent}' message arrives, then enters the main
%% loop doreduce/5 with both pending values cleared (0) and `CurId' as the
%% process that receives the final result.
doreduce(Index, CurId) ->
    Upstream = receive {connect, Pid} -> Pid end,
    doreduce(Index, Upstream, 0, 0, CurId).
%% @doc Main loop of a reducer node.
%%
%% `Index' is the node's position in the heap-shaped tree, `To' its parent,
%% `Root' the process that receives the final `{result, Sum}'. `Val1' and
%% `Val2' hold a value already received from the first (`reduce1') or second
%% (`reduce2') child; 0 means "nothing pending", so operands must be
%% positive for the pairing to work.
%%
%% Messages handled:
%%   {map, V}      - leaf input from a mapper; passed on unchanged.
%%   {reduce1, V}  - value from the even-indexed child.
%%   {reduce2, V}  - value from the odd-indexed child.
%%   true          - leaves the loop; the function returns `true'.
%% When the second operand of a pair arrives, the sum is emitted and both
%% slots are cleared. A `reduceN' message whose slot is already occupied
%% matches no clause and stays in the mailbox until the pair completes.
%%
%% The three copies of the "forward to parent or root" branch now live in
%% emit/4. One consequence: a root node (Index 1) that receives `{map, V}'
%% reports `{result, V}' to `Root' instead of sending `{reduce2, V}'.
doreduce(Index, To, Val1, Val2, Root) ->
    receive
        {map, Val} ->
            emit(Index, To, Root, Val),
            doreduce(Index, To, 0, 0, Root);
        {reduce1, V1} when Val2 > 0, Val1 == 0 ->
            emit(Index, To, Root, V1 + Val2),
            doreduce(Index, To, 0, 0, Root);
        {reduce2, V2} when Val1 > 0, Val2 == 0 ->
            emit(Index, To, Root, Val1 + V2),
            doreduce(Index, To, 0, 0, Root);
        {reduce1, V1} when Val1 == 0, Val2 == 0 ->
            doreduce(Index, To, V1, 0, Root);
        {reduce2, V2} when Val1 == 0, Val2 == 0 ->
            doreduce(Index, To, 0, V2, Root);
        true ->
            true
    end.

%% Sends `Sum' one level up: the root node reports `{result, Sum}' to `Root';
%% any other node tags the value by its own parity so the parent can tell its
%% two children apart (even index -> reduce1, odd index -> reduce2).
emit(1, _To, Root, Sum) ->
    Root ! {result, Sum};
emit(Index, To, _Root, Sum) when Index rem 2 == 0 ->
    To ! {reduce1, Sum};
emit(_Index, To, _Root, Sum) ->
    To ! {reduce2, Sum}.

%% @doc Entry point of a mapper process.
%% Blocks until a `{connect, ReduceId}' message names the reducer this
%% mapper feeds, then enters the main loop domap/2.
domap(Index) ->
    Reducer = receive {connect, Pid} -> Pid end,
    domap(Index, Reducer).

%% @doc Main loop of a mapper process.
%% For every `{mapper, V}' message the square of V is sent to `To' as
%% `{map, V*V}'. The atom `true' ends the loop and is returned.
domap(Index, To) ->
    receive
        true ->
            true;
        {mapper, Input} ->
            Squared = Input * Input,
            To ! {map, Squared},
            domap(Index, To)
    end.

尽管这类任务根本不适合用Erlang来完成,但仍有一个相当简单的解决方案:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
-module(mr).

-export([start/1, start/2]).

%% @doc Command-line entry point (`erl -run mr start R N').
%% Both arguments arrive as strings. Runs start/2 and prints the number of
%% results followed by the first result.
start([RepeatArg, SizeArg]) ->
    Repeats = list_to_integer(RepeatArg),
    Size = list_to_integer(SizeArg),
    Sums = start(Repeats, Size),
    io:format("~B x ~B~n", [length(Sums), hd(Sums)]).

%% @doc Spawns the process tree for the range 1..N and collects the `R'
%% results sent back by its top-level process. Returns them as a list.
start(R, N) ->
    Top = start(self(), R, 1, N),
    Rounds = lists:seq(1, R),
    [await_result(Top) || _ <- Rounds].

%% Blocks until a `{From, Value}' message from `From' arrives; returns Value.
await_result(From) ->
    receive
        {From, Value} -> Value
    end.

%% @doc Spawns (linked) the process responsible for the range From..To and
%% returns its pid.
%%
%% A one-element range becomes a mapper; a wider range becomes a reducer,
%% which splits the range in two and spawns a child for each half.
%%
%% The `From < To' guard rejects an empty range such as 1..0 (reached when N
%% is 0 or negative). Without it the midpoint split produced the same empty
%% range again and reducers were spawned without bound; now the call fails
%% with function_clause.
start(Parent, R, N, N) ->
    spawn_link(fun() -> mapper(Parent, R, N) end);
start(Parent, R, From, To) when From < To ->
    spawn_link(fun() -> reducer(Parent, R, From, To) end).

%% @doc Leaf of the tree: sends `{self(), N*N}' to `Parent' `R' times.
%% Returns the list of messages sent.
mapper(Parent, R, N) ->
    Announce = fun(_Round) -> Parent ! {self(), N*N} end,
    [Announce(Round) || Round <- lists:seq(1, R)].

%% @doc Inner node of the tree for the range From..To.
%% Splits the range at its midpoint, spawns one child per half via start/4,
%% then `R' times waits for one value from each child (first child first)
%% and sends their sum to `Parent' as `{self(), Sum}'.
%% Returns the list of messages sent.
reducer(Parent, R, From, To) ->
    Me = self(),
    Mid = (From + To) div 2,
    Left = start(Me, R, From, Mid),
    Right = start(Me, R, Mid + 1, To),
    [Parent ! {Me, await_pair(Left, Right)} || _ <- lists:seq(1, R)].

%% Waits for one value from `Left', then one from `Right'; returns their sum.
await_pair(Left, Right) ->
    X = receive {Left, FromLeft} -> FromLeft end,
    Y = receive {Right, FromRight} -> FromRight end,
    X + Y.

您可以使用以下命令运行它:

1
2
3
4
5
6
7
$ erlc -W mr.erl
$ time erl -noshell -run mr start 1024 1024 -s init stop
1024 x 358438400

real    0m2.162s
user    0m4.177s
sys     0m0.151s

但其中大部分时间是虚拟机启动和优雅停止(graceful shutdown)的开销:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
$ time erl -noshell -run mr start 1024 1024 -s erlang halt
1024 x 358438400

real    0m1.172s
user    0m4.110s
sys     0m0.150s

$ erl
1> timer:tc(fun() -> mr:start(1024,1024) end).
{978453,
 [358438400,358438400,358438400,358438400,358438400,
  358438400,358438400,358438400,358438400,358438400,358438400,
  358438400,358438400,358438400,358438400,358438400,358438400,
  358438400,358438400,358438400,358438400,358438400,358438400,
  358438400,358438400,358438400,358438400|...]}

请记住,这更像是一种优雅的解决方案,而不是高效的解决方案。高效的解决方案应当在归约(reduce)树的分支数与通信开销之间取得平衡。