How to implement the MapReduce example in Erlang efficiently?
我正在尝试比较并发编程语言(例如Haskell,Go和Erlang)的性能。下面的Go代码计算平方和(重复计算R次的平方和):
$1^2 + 2^2 + 3^2 + \cdots + 1024^2$
// Sum-of-squares pipeline benchmark.
//
// N mapper goroutines square their inputs and feed the leaves of a binary
// tree of N-1 reducer goroutines. The channels are laid out like a heap:
// reducer i reads r[2*i] and r[2*i+1] and writes r[i], so r[1] carries the
// final sum. The inputs 1..N are pushed through the tree R times.
package main

import "fmt"

// mapper squares every value received on in and forwards it on out.
// It runs until in is closed.
func mapper(in chan int, out chan int) {
	for v := range in {
		out <- v * v
	}
}

// reducer pairs each value from in1 with the next value from in2 and
// sends their sum on out. It runs until in1 is closed.
func reducer(in1, in2 chan int, out chan int) {
	for i1 := range in1 {
		i2 := <-in2
		out <- i1 + i2
	}
}

func main() {
	const N = 1024 // calculate sum of squares up to N; N must be power of 2
	const R = 10   // number of repetitions to fill the "pipe"

	// r[1..N-1] are reducer outputs, r[N..2N-1] are mapper outputs
	// (r[0] is allocated but never used).
	var r [N * 2]chan int
	for i := range r {
		r[i] = make(chan int)
	}

	// m[i] is the input channel of mapper i.
	var m [N]chan int
	for i := range m {
		m[i] = make(chan int)
	}

	for i := 0; i < N; i++ {
		go mapper(m[i], r[i+N])
	}
	for i := 1; i < N; i++ {
		go reducer(r[i*2], r[i*2+1], r[i])
	}

	// Feed the inputs from a separate goroutine so that several rounds can
	// be in flight in the tree while main drains the results.
	go func() {
		for j := 0; j < R; j++ {
			for i := 0; i < N; i++ {
				m[i] <- i + 1
			}
		}
	}()

	// Drain one result per round. The last sum is printed so that the
	// "fmt" import is actually used (an unused import does not compile)
	// and the result can be checked by eye.
	sum := 0
	for j := 0; j < R; j++ {
		sum = <-r[1]
	}
	fmt.Println(sum)
}
以下代码是Erlang中的MapReduce解决方案。我是Erlang的新手。我想比较Go,Haskell和Erlang的性能。我的问题是如何优化此Erlang代码。我使用
%% Sum-of-squares MapReduce benchmark.
%%
%% Num mapper processes square their input and feed the leaves of a binary
%% tree of 2*Num-1 reducer processes. The tree uses a heap layout: reducer I
%% forwards to reducer I div 2, tagging its message reduce1 when I is even
%% and reduce2 when I is odd. Reducer 1 is the root and reports every
%% round's total to the process that called create/2 as {result, Sum}.
-module(mr).
-export([start/0, create/2, doreduce/2, domap/1, repeat/3]).

%% Command-line entry point. Expects two plain arguments: the number of
%% mappers followed by the number of rounds.
start() ->
    [NumArg, RepeatArg | _] = init:get_plain_arguments(),
    create(list_to_integer(RepeatArg), list_to_integer(NumArg)).

%% Builds the process tree for Num inputs, runs R rounds and returns true.
%% Num = 1 is supported (the single reducer is both leaf and root).
%% Once all rounds are done every worker is sent the stop message `true`
%% so that no processes are left behind.
create(R, Num) when is_integer(Num), Num > 0 ->
    Root = self(),
    Reducers = [spawn(?MODULE, doreduce, [Index, Root])
                || Index <- lists:seq(1, 2*Num - 1)],
    Mappers = [spawn(?MODULE, domap, [In]) || In <- lists:seq(1, Num)],
    connect_reducers(Reducers, Root),
    %% The last Num reducers (indices Num .. 2*Num-1) are the leaves.
    connect_mappers(Mappers, lists:nthtail(Num - 1, Reducers)),
    Done = repeat(R, Num, Mappers),
    lists:foreach(fun(Pid) -> Pid ! true end, Mappers ++ Reducers),
    Done.

%% Runs exactly R rounds: each round sends 1..Num to the first Num mappers
%% and then waits for the root reducer's {result, Sum}. Returns true.
repeat(0, _Num, _Mappers) ->
    true;
repeat(R, Num, Mappers) when is_integer(R), R > 0 ->
    send_message(Num, Mappers),
    receive
        {result, _Sum} ->
            %io:format("Got: ~p ~p ~n", [R, _Sum])
            true
    end,
    repeat(R - 1, Num, Mappers).

%% Sends {mapper, K} to the K-th mapper for K = 1..Num in a single pass
%% over the list (no repeated lists:nth/2 lookups).
send_message(Num, Mappers) ->
    lists:foreach(fun({Pid, K}) -> Pid ! {mapper, K} end,
                  lists:zip(lists:sublist(Mappers, Num), lists:seq(1, Num))).

%% Tells every reducer where to forward: reducer I to reducer I div 2, and
%% the first reducer to Root. A tuple gives constant-time index lookups.
connect_reducers(Reducers, Root) ->
    Tree = list_to_tuple(Reducers),
    element(1, Tree) ! {connect, Root},
    lists:foreach(fun(I) ->
                          element(I, Tree) ! {connect, element(I div 2, Tree)}
                  end,
                  lists:seq(2, tuple_size(Tree))).

%% Connects the K-th mapper to the K-th leaf reducer.
connect_mappers(Mappers, Leaves) ->
    lists:foreach(fun({Mapper, Leaf}) -> Mapper ! {connect, Leaf} end,
                  lists:zip(Mappers, Leaves)).

%% Reducer process entry point. Index is the reducer's position in the
%% tree and CurId is the process that collects the final results. Waits
%% for {connect, Parent} before entering the reduce loop.
doreduce(Index, CurId) ->
    receive
        {connect, Parent} ->
            reduce_loop(Index, Parent, CurId)
    end.

%% Reduce loop. A leaf forwards each {map, Val} unchanged; an inner node
%% adds one reduce1 value to one reduce2 value and forwards the sum. The
%% nested selective receive lets the two halves arrive in either order
%% without needing a sentinel value for "nothing received yet". The
%% message `true` stops the process.
reduce_loop(Index, To, Root) ->
    receive
        {map, Val} ->
            emit(Index, To, Root, Val),
            reduce_loop(Index, To, Root);
        {reduce1, V1} ->
            receive
                {reduce2, V2} ->
                    emit(Index, To, Root, V1 + V2),
                    reduce_loop(Index, To, Root);
                true ->
                    true
            end;
        true ->
            true
    end.

%% Forwards Value one level up: the root reports {result, Value} to Root,
%% even indices send reduce1 and odd indices send reduce2 to the parent.
emit(1, _To, Root, Value) ->
    Root ! {result, Value};
emit(Index, To, _Root, Value) when Index rem 2 == 0 ->
    To ! {reduce1, Value};
emit(_Index, To, _Root, Value) ->
    To ! {reduce2, Value}.

%% Mapper process entry point. Waits for {connect, ReduceId} before
%% entering the map loop.
domap(Index) ->
    receive
        {connect, ReduceId} ->
            domap(Index, ReduceId)
    end.

%% Map loop: squares every {mapper, V} and sends {map, V*V} to the
%% connected reducer. The message `true` stops the process.
domap(Index, To) ->
    receive
        {mapper, V} ->
            To ! {map, V*V},
            domap(Index, To);
        true ->
            true
    end.
尽管这类任务其实并不适合Erlang,但仍然有一个非常简单的解决方案:
%% Sum of squares 1^2 + ... + N^2 computed R times by a binary tree of
%% linked processes: leaves ("mappers") emit N*N for their single value,
%% inner nodes ("reducers") split their range in half and add the two
%% partial sums.
-module(mr).
-export([start/1, start/2]).

%% Command-line entry point. Takes [R, N] as strings (as passed by
%% `erl -run mr start R N'), runs start/2 and prints
%% "<number of results> x <first result>".
start([R, N]) ->
    case start(list_to_integer(R), list_to_integer(N)) of
        [] ->
            %% R = 0: there is no first result to print.
            io:format("0 x (no results)~n");
        [First | _] = Result ->
            io:format("~B x ~B~n", [length(Result), First])
    end.

%% Returns a list of R results, each the sum of squares of 1..N.
%% The guards reject N < 1, which would otherwise make the range 1..N
%% split forever and spawn processes without bound, and negative R, which
%% would crash the linked workers inside lists:seq/2.
start(R, N) when is_integer(R), R >= 0, is_integer(N), N >= 1 ->
    Self = self(),
    Reducer = start(Self, R, 1, N),
    [ receive {Reducer, Result} -> Result end || _ <- lists:seq(1, R) ].

%% Spawns the node responsible for the range From..To and returns its pid:
%% a mapper when the range is a single value, otherwise a reducer.
start(Parent, R, N, N) ->
    spawn_link(fun() -> mapper(Parent, R, N) end);
start(Parent, R, From, To) ->
    spawn_link(fun() -> reducer(Parent, R, From, To) end).

%% Sends {self(), N*N} to Parent R times, then returns (ending the process).
mapper(Parent, R, N) ->
    Self = self(),
    lists:foreach(fun(_) -> Parent ! {Self, N*N} end, lists:seq(1, R)).

%% Spawns one child per half of From..To, then R times waits for one value
%% from each child (first A, then B) and sends their sum to Parent.
reducer(Parent, R, From, To) ->
    Self = self(),
    Middle = (From + To) div 2,
    A = start(Self, R, From, Middle),
    B = start(Self, R, Middle + 1, To),
    lists:foreach(fun(_) ->
                          X = receive {A, VA} -> VA end,
                          Y = receive {B, VB} -> VB end,
                          Parent ! {Self, X + Y}
                  end,
                  lists:seq(1, R)).
您可以使用下面的命令编译并运行它:
1 2 3 4 5 6 7 | $ erlc -W mr.erl $ time erl -noshell -run mr start 1024 1024 -s init stop 1024 x 358438400 real 0m2.162s user 0m4.177s sys 0m0.151s |
但是其中大部分时间是虚拟机启动和优雅关闭(`init:stop`)的开销;改用 `erlang:halt` 直接退出后:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | $ time erl -noshell -run mr start 1024 1024 -s erlang halt 1024 x 358438400 real 0m1.172s user 0m4.110s sys 0m0.150s $ erl 1> timer:tc(fun() -> mr:start(1024,1024) end). {978453, [358438400,358438400,358438400,358438400,358438400, 358438400,358438400,358438400,358438400,358438400,358438400, 358438400,358438400,358438400,358438400,358438400,358438400, 358438400,358438400,358438400,358438400,358438400,358438400, 358438400,358438400,358438400,358438400|...]} |
请记住,它更像是一种优雅的解决方案,而不是高效的解决方案。高效的解决方案应该在归约树(reduce tree)的分支数与通信开销之间取得平衡。