Python：如何提高 random.sample 的性能

Python increase performance of random.sample

我正在编写一个函数来随机选择存储在字典中的元素：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

import random
from liblas import file as lasfile
from collections import defaultdict

def point_random_selection(list, k):
    """Return ``k`` elements drawn at random, without replacement, from ``list``.

    Args:
        list: Sequence of candidate elements. The name shadows the builtin
            but is kept so that existing keyword callers keep working.
        k: Number of elements to select.

    Returns:
        A new list holding ``k`` randomly chosen elements, or ``list``
        itself (the same object, not a copy) when ``random.sample``
        rejects the request with ``ValueError``.
    """
    try:
        return random.sample(list, k)
    except ValueError:
        # random.sample raises ValueError when k exceeds the population
        # size (or is negative); in that case keep every element.
        return list

def world2Pixel_Id(x, y, X_Min, Y_Max, xDist, yDist):
    """Return the id of the grid cell containing the point ``(x, y)``.

    Args:
        x, y: Coordinates of the point.
        X_Min: X value at which column 0 starts.
        Y_Max: Y value at which row 0 starts; rows grow as ``y`` decreases.
        xDist, yDist: Cell size along x and y.

    Returns:
        The string ``"<col>_<row>"``.
    """
    # int() truncates toward zero, so a point less than one cell outside
    # the X_Min / Y_Max edge is still assigned index 0.
    col = int((x - X_Min) / xDist)
    row = int((Y_Max - y) / yDist)
    return "{0}_{1}".format(col, row)

def point_GridGroups(inFile, X_Min, Y_Max, xDist, yDist):
    """Group every point read from a LAS file by the grid cell it falls in.

    Args:
        inFile: First argument handed to ``lasfile.File``, opened in
            ``'r'`` mode (presumably the input file name).
        X_Min, Y_Max, xDist, yDist: Grid origin and cell size, forwarded
            unchanged to ``world2Pixel_Id``.

    Returns:
        A ``defaultdict(list)`` mapping each cell id to the points that
        landed in that cell, in the order they were read.
    """
    Groups = defaultdict(list)
    for p in lasfile.File(inFile, None, 'r'):
        # Named cell_id rather than id to avoid shadowing the builtin.
        cell_id = world2Pixel_Id(p.x, p.y, X_Min, Y_Max, xDist, yDist)
        Groups[cell_id].append(p)
    return Groups

其中 k 是要选择的元素数，Groups 是字典。

1
2
3
4
5
6
7
8

file_out = lasfile.File("outPut", mode='w', header=h)
try:
    for cell_points in Groups.itervalues():
        # Select up to k points for each dictionary key and save them.
        for point in point_random_selection(cell_points, k):
            file_out.write(point)
finally:
    # Close the output even if sampling or writing fails part-way.
    file_out.close()

我的问题是，这种方法非常慢（处理一个约 800 MB 的文件需要 4 天）。

相关讨论

您可以尝试在读取坐标的同时更新样本。这样至少不必在抽样之前把所有内容都存储在内存中，但并不保证一定会更快。

以下内容基于BlkKnght的出色答案，即根据文件输入构建随机样本而不保留所有行。这只是扩展了它以保留多个样本。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

import random
from liblas import file as lasfile
from collections import defaultdict

def world2Pixel_Id(x, y, X_Min, Y_Max, xDist, yDist):
    """Return the ``(col, row)`` grid cell containing the point ``(x, y)``.

    Args:
        x, y: Coordinates of the point.
        X_Min: X value at which column 0 starts.
        Y_Max: Y value at which row 0 starts; rows grow as ``y`` decreases.
        xDist, yDist: Cell size along x and y.

    Returns:
        A ``(col, row)`` tuple of ints, usable directly as a dict key.
    """
    # int() truncates toward zero, so a point less than one cell outside
    # the X_Min / Y_Max edge is still assigned index 0.
    col = int((x - X_Min) / xDist)
    row = int((Y_Max - y) / yDist)
    return (col, row)

def random_grouped_samples(infile, n, X_Min, Y_Max, xDist, yDist):
    """Select up to n points *per group* from infile.

    The file is read once and a separate reservoir sample is kept for each
    grid cell, so only the retained points are stored, not the whole file.

    Args:
        infile: First argument handed to ``lasfile.File``, opened in
            ``'r'`` mode (presumably the input file name).
        n: Maximum number of points to keep for each grid cell.
        X_Min, Y_Max, xDist, yDist: Grid origin and cell size, forwarded
            unchanged to ``world2Pixel_Id``.

    Returns:
        A ``defaultdict(list)`` mapping each cell id to a list of at most
        ``n`` sampled points.
    """
    groupcounts = defaultdict(int)  # points seen so far, per cell
    samples = defaultdict(list)

    # The original body read ``inFile``, which is not the parameter name
    # and raised NameError on the first call.
    for p in lasfile.File(infile, None, 'r'):
        cell = world2Pixel_Id(p.x, p.y, X_Min, Y_Max, xDist, yDist)
        seen = groupcounts[cell]
        # randint is inclusive on both ends: r is uniform over 0..seen.
        r = random.randint(0, seen)

        if r < n:
            if seen < n:
                # Add the first n items in random order.
                samples[cell].insert(r, p)
            else:
                # At a decreasing rate, replace random items.
                samples[cell][r] = p

        groupcounts[cell] += 1

    return samples

上面的函数接受 infile、边界坐标以及样本大小 n，并返回分组样本：每个组最多包含 n 个项目，且这些项目都是均匀随机选取的。

由于 id 只是用作分组的键，我把它简化为只计算 (col, row) 元组，无需再将其转换为字符串。

您可以使用以下命令将它们写到文件中：

1
2
3
4
5
6
7

file_out = lasfile.File("outPut", mode='w', header=h)
try:
    # Write every sampled point, one grid cell at a time.
    for group in samples.itervalues():
        for p in group:
            file_out.write(p)
finally:
    # Close the output even if a write fails part-way.
    file_out.close()