Avoid deepcopy due to performance
我有一个列表,其中还有3个列表。 我需要对列表进行一些操作,因为我有180.000的深度复制步骤已经花费了2.5s来复制一次列表。
220秒的计算时间中,包括操作在内的总时间花费了80秒。
1 2 3 4 5 6 | s = [[1,2,3], [4,5,6], [7,8,9]] s1 = copy.deepcopy(s) s1[0][0] = s[1][1] s1[1][1] = s[0][0] |
所示的操作需要重复一百万次。 因此,深度复制使我面临性能瓶颈。
有没有更高性能的方法或"取消引用"列表
1 2 3 4 5 6 7 | In [9]: s = [[random.randint(1, 1000) for _ in range(100)] for _ in range(100)] In [10]: %timeit copy.deepcopy(s) 10 loops, best of 3: 22.7 ms per loop In [11]: %timeit [list(x) for x in s] 10000 loops, best of 3: 123 μs per loop In [18]: %timeit list(map(list, s)) 10000 loops, best of 3: 111 μs per loop |
或者,根据您的应用程序,最好不要复制并存储(已修改的)列表本身,而只是以修改后的单元格形式或作为命令堆栈的形式进行修改和存储。
好的,所以copy.deepcopy是在python中实现的,我对您的问题做了一个小的测试脚本:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | # deepcopy.py import copy import hotshot, hotshot.stats import line_profiler def prof(): all = [] for _ in range(180000): all.append([ [1,2,3], [1,2,3], [1,2,3], ]) prof = hotshot.Profile("deepcopy.py.prof") prof.start() copy.deepcopy(all) prof.stop() prof.close() stats = hotshot.stats.load('deepcopy.py.prof') stats.strip_dirs() stats.sort_stats('time', 'calls') stats.print_stats(20) def lineprof(): all = [] for _ in range(180000): all.append([ [1,2,3], [1,2,3], [1,2,3], ]) prof = line_profiler.LineProfiler() prof.add_function(copy.deepcopy) prof.enable() copy.deepcopy(all) prof.disable() prof.print_stats() prof() lineprof() |
首先,我使用hotshot来了解正在发生的事情,似乎没有比deepcopy本身更多的东西,所以我添加了行配置文件,结果如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | 3780009 function calls (720009 primitive calls) in 3.156 seconds Ordered by: internal time, call count ncalls tottime percall cumtime percall filename:lineno(function) 720001/1 1.483 0.000 3.156 3.156 copy.py:226(_deepcopy_list) 2340001/1 1.425 0.000 3.156 3.156 copy.py:145(deepcopy) 720004 0.248 0.000 0.248 0.000 copy.py:267(_keep_alive) 3 0.000 0.000 0.000 0.000 copy.py:198(_deepcopy_atomic) 0 0.000 0.000 profile:0(profiler) Timer unit: 1e-06 s Total time: 13.6727 s File: /usr/lib64/python2.7/copy.py Function: deepcopy at line 145 Line # Hits Time Per Hit % Time Line Contents ============================================================== 145 def deepcopy(x, memo=None, _nil=[]): 146 """Deep copy operation on arbitrary Python objects. 147 148 See the module's __doc__ string for more info. 149 """ 150 151 2340001 1614687 0.7 11.8 if memo is None: 152 1 1 1.0 0.0 memo = {} 153 154 2340001 1725759 0.7 12.6 d = id(x) 155 2340001 2123792 0.9 15.5 y = memo.get(d, _nil) 156 2340001 1550940 0.7 11.3 if y is not _nil: 157 1619997 1047121 0.6 7.7 return y 158 159 720004 587838 0.8 4.3 cls = type(x) 160 161 720004 577271 0.8 4.2 copier = _deepcopy_dispatch.get(cls) 162 720004 476286 0.7 3.5 if copier: 163 720004 1813706 2.5 13.3 y = copier(x, memo) 164 else: 165 try: 166 issc = issubclass(cls, type) 167 except TypeError: # cls is not a class (old Boost; see SF #502085) 168 issc = 0 169 if issc: 170 y = _deepcopy_atomic(x, memo) 171 else: 172 copier = getattr(x,"__deepcopy__", None) 173 if copier: 174 y = copier(memo) 175 else: 176 reductor = dispatch_table.get(cls) 177 if reductor: 178 rv = reductor(x) 179 else: 180 reductor = getattr(x,"__reduce_ex__", None) 181 if reductor: 182 rv = reductor(2) 183 else: 184 reductor = getattr(x,"__reduce__", None) 185 if reductor: 186 rv = reductor() 187 else: 188 raise Error( 189 "un(deep)copyable object of type %s" % cls) 190 y = _reconstruct(x, rv, 1, memo) 191 192 720004 579181 0.8 4.2 memo[d] = y 193 720004 1115258 1.5 8.2 _keep_alive(x, memo) # Make sure x lives at least as long as d 194 720004 460834 0.6 3.4 return y |
因此,问题似乎在于,备忘无济于事,并且浪费了更多的时间在簿记而不是应对上,所以我的建议是利用列表定义明确的事实,然后自己进行复制:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | import hotshot, hotshot.stats import line_profiler def manualcopy(original_list): copy = [] for item in original_list: copy.append( [ list(item[0]), list(item[1]), list(item[2]), ] ) return copy def prof(): all = [] for _ in range(180000): all.append([ [1,2,3], [1,2,3], [1,2,3], ]) prof = hotshot.Profile("deepcopy.py.prof") prof.start() manualcopy(all) prof.stop() prof.close() stats = hotshot.stats.load('deepcopy.py.prof') stats.strip_dirs() stats.sort_stats('time', 'calls') stats.print_stats(20) def lineprof(): all = [] for _ in range(180000): all.append([ [1,2,3], [1,2,3], [1,2,3], ]) prof = line_profiler.LineProfiler() prof.add_function(manualcopy) prof.enable() manualcopy(all) prof.disable() prof.print_stats() prof() lineprof() |
那将从3.156秒减少到0.446秒
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | 1 function calls in 0.446 seconds Ordered by: internal time, call count ncalls tottime percall cumtime percall filename:lineno(function) 1 0.446 0.446 0.446 0.446 manualcopy.py:5(manualcopy) 0 0.000 0.000 profile:0(profiler) Timer unit: 1e-06 s Total time: 0.762817 s File: manualcopy.py Function: manualcopy at line 5 Line # Hits Time Per Hit % Time Line Contents ============================================================== 5 def manualcopy(original_list): 6 1 3 3.0 0.0 copy = [] 7 180001 62622 0.3 8.2 for item in original_list: 8 180000 66805 0.4 8.8 copy.append( 9 [ 10 180000 106683 0.6 14.0 list(item[0]), 11 180000 137308 0.8 18.0 list(item[1]), 12 180000 389396 2.2 51.0 list(item[2]), 13 ] 14 ) 15 1 0 0.0 0.0 return copy |