python 实现SOM：代码注释与应用示例

SOM原理介绍可参考：https://zhuanlan.zhihu.com/p/73534694

代码来源：https://github.com/wzg16/minisom

可以直接在环境中安装：

pip install minisom

或者下载代码后安装

git clone https://github.com/JustGlowing/minisom.git
cd minisom
python setup.py install

以下附录内容来自知乎： https://zhuanlan.zhihu.com/p/73534694

源码解读注释如下

minisom.py

from math import sqrt

from numpy import (array, unravel_index, nditer, linalg, random, subtract, max,
power, exp, pi, zeros, ones, arange, outer, meshgrid, dot,
logical_and, mean, std, cov, argsort, linspace, transpose,
einsum, prod, nan, sqrt, hstack, diff, argmin, multiply)
from numpy import sum as npsum
from numpy.linalg import norm
from collections import defaultdict, Counter
from warnings import warn
from sys import stdout
from time import time
from datetime import timedelta
import pickle
import os

# for unit tests
from numpy.testing import assert_almost_equal, assert_array_almost_equal
from numpy.testing import assert_array_equal
import unittest

"""
Minimalistic implementation of the Self Organizing Maps (SOM).
"""

def _build_iteration_indexes(data_len, num_iterations,
                             verbose=False, random_generator=None):
    """Return an iterable with the sample index to use at each iteration.

    The indexes cycle through ``0 .. data_len-1`` until ``num_iterations``
    values have been produced.

    Parameters
    ----------
    data_len : int
        Number of samples in the training data.
    num_iterations : int
        Number of indexes to produce.
    verbose : bool (default=False)
        If True, the indexes are wrapped in a generator that also writes
        the training progress to stdout.
    random_generator : optional (default=None)
        If not None, its ``shuffle`` method is used to randomize the order
        of the indexes in place (the caller in this file passes a
        numpy.random.RandomState).
    """
    iterations = arange(num_iterations) % data_len
    # Explicit None check, as documented: a generator object must never be
    # skipped just because it happens to evaluate as falsy.
    if random_generator is not None:
        random_generator.shuffle(iterations)
    if verbose:
        return _wrap_index__in_verbose(iterations)
    return iterations

def _wrap_index__in_verbose(iterations):
    """Yield the values in ``iterations`` while writing progress to stdout.

    One status line is written before the first value is yielded, and one
    more after each value has been consumed. Every line starts with a
    carriage return so that it overwrites the previous one on a terminal.

    Parameters
    ----------
    iterations : sized iterable
        Values to yield; ``len(iterations)`` is used as the total count.
    """
    m = len(iterations)
    digits = len(str(m))
    progress = '\r [ {s:{d}} / {m} ] {s:3.0f}% - ? it/s'
    progress = progress.format(m=m, d=digits, s=0)
    # The initial status line is written once (it used to be emitted twice).
    stdout.write(progress)
    beginning = time()
    for i, it in enumerate(iterations):
        yield it
        done = i + 1
        # Remaining items are m - done; the average time per item so far is
        # elapsed / done. The estimate therefore reaches zero at the end.
        sec_left = ((m - done) * (time() - beginning)) / done
        time_left = str(timedelta(seconds=sec_left))[:7]
        progress = '\r [ {i:{d}} / {m} ]'.format(i=done, d=digits, m=m)
        progress += ' {p:3.0f}%'.format(p=100*done/m)
        progress += ' - {time_left} left '.format(time_left=time_left)
        stdout.write(progress)

def fast_norm(x):
    """Return the 2-norm of a 1-D numpy array.

    Computed as the square root of the dot product of ``x`` with itself.
    The original author notes this is faster than ``linalg.norm`` for
    1-D arrays (measured on numpy 1.9.2rc1).
    """
    squared_length = dot(x, x.T)
    return sqrt(squared_length)

def asymptotic_decay(learning_rate, t, max_iter):
    """Decay a value asymptotically as the training progresses.

    Returns ``learning_rate / (1 + t / (max_iter / 2))``.

    Parameters
    ----------
    learning_rate : float
        Value to decay (the initial learning rate or the initial sigma).
    t : int
        Current iteration.
    max_iter : int
        Maximum number of iterations for the training.
    """
    half_horizon = max_iter / 2
    damping = 1 + t / half_horizon
    return learning_rate / damping

class MiniSom(object):
    """Minimalistic Self Organizing Map (SOM).

    The map is a grid of ``x`` by ``y`` neurons; each neuron holds a weight
    vector with ``input_len`` elements.
    """

    def __init__(self, x, y, input_len, sigma=1.0, learning_rate=0.5,
                 decay_function=asymptotic_decay,
                 neighborhood_function='gaussian', topology='rectangular',
                 activation_distance='euclidean', random_seed=None):
        """Initializes a Self Organizing Maps.

        A rule of thumb to set the size of the grid for a dimensionality
        reduction task is that it should contain 5*sqrt(N) neurons
        where N is the number of samples in the dataset to analyze.
        E.g. if your dataset has 150 samples, 5*sqrt(150) = 61.23
        hence a map 8-by-8 should perform well.

        Parameters
        ----------
        x : int
            x dimension of the SOM.
        y : int
            y dimension of the SOM.
        input_len : int
            Number of the elements of the vectors in input.
        sigma : float, optional (default=1.0)
            Spread of the neighborhood function, needs to be adequate
            to the dimensions of the map.
            (with the default decay, at the iteration t we have
            sigma(t) = sigma / (1 + t/T) where T is #num_iteration/2)
        learning_rate : float, optional (default=0.5)
            Initial learning rate.
            (with the default decay, at the iteration t we have
            learning_rate(t) = learning_rate / (1 + t/T)
            where T is #num_iteration/2)
        decay_function : function (default=asymptotic_decay)
            Function that reduces learning_rate and sigma at each iteration.
            The default function is:
                learning_rate / (1+t/(max_iterations/2))
            A custom decay function will need to take in input
            three parameters in the following order:
            1. learning rate
            2. current iteration
            3. maximum number of iterations allowed
            Note that if a lambda function is used to define the decay
            MiniSom will not be picklable anymore.
        neighborhood_function : string, optional (default='gaussian')
            Function that weights the neighborhood of a position in the map.
            Possible values: 'gaussian', 'mexican_hat', 'bubble', 'triangle'
        topology : string, optional (default='rectangular')
            Topology of the map.
            Possible values: 'rectangular', 'hexagonal'
        activation_distance : string, optional (default='euclidean')
            Distance used to activate the map.
            Possible values: 'euclidean', 'cosine', 'manhattan', 'chebyshev'
        random_seed : int, optional (default=None)
            Random seed to use.

        Raises
        ------
        ValueError
            If topology, neighborhood_function or activation_distance is
            not one of the supported values.
        """
        if sigma >= x or sigma >= y:
            warn('Warning: sigma is too high for the dimension of the map.')

        self._random_generator = random.RandomState(random_seed)

        self._learning_rate = learning_rate
        self._sigma = sigma
        self._input_len = input_len
        # Random initialization: values drawn in [-1, 1), then every weight
        # vector is scaled to unit euclidean norm.
        self._weights = self._random_generator.rand(x, y, input_len)*2-1
        self._weights /= linalg.norm(self._weights, axis=-1, keepdims=True)

        self._activation_map = zeros((x, y))
        self._neigx = arange(x)  # neuron indexes along x
        self._neigy = arange(y)  # used to evaluate the neighborhood function

        if topology not in ['hexagonal', 'rectangular']:
            msg = '%s not supported only hexagonal and rectangular available'
            raise ValueError(msg % topology)
        self.topology = topology
        self._xx, self._yy = meshgrid(self._neigx, self._neigy)
        self._xx = self._xx.astype(float)
        self._yy = self._yy.astype(float)
        if topology == 'hexagonal':
            # Shift every other row by half a cell to lay out a hex grid.
            self._xx[::-2] -= 0.5
            if neighborhood_function in ['triangle']:
                warn('triangle neighborhood function does not ' +
                     'take in account hexagonal topology')

        self._decay_function = decay_function

        neig_functions = {'gaussian': self._gaussian,
                          'mexican_hat': self._mexican_hat,
                          'bubble': self._bubble,
                          'triangle': self._triangle}

        if neighborhood_function not in neig_functions:
            msg = '%s not supported. Functions available: %s'
            raise ValueError(msg % (neighborhood_function,
                                    ', '.join(neig_functions.keys())))

        if neighborhood_function in ['triangle',
                                     'bubble'] and (divmod(sigma, 1)[1] != 0
                                                    or sigma < 1):
            # The trailing space keeps the two halves of the message apart.
            warn('sigma should be an integer >=1 when triangle or bubble ' +
                 'are used as neighborhood function')

        self.neighborhood = neig_functions[neighborhood_function]

        distance_functions = {'euclidean': self._euclidean_distance,
                              'cosine': self._cosine_distance,
                              'manhattan': self._manhattan_distance,
                              'chebyshev': self._chebyshev_distance}

        if activation_distance not in distance_functions:
            msg = '%s not supported. Distances available: %s'
            raise ValueError(msg % (activation_distance,
                                    ', '.join(distance_functions.keys())))

        # Distance used to compare an input vector with the weights.
        self._activation_distance = distance_functions[activation_distance]

    def get_weights(self):
        """Returns the weights of the neural network."""
        return self._weights

    def get_euclidean_coordinates(self):
        """Returns the position of the neurons on an euclidean
        plane that reflects the chosen topology in two meshgrids xx and yy.
        Neuron with map coordinates (1, 4) has coordinate (xx[1, 4], yy[1, 4])
        in the euclidean plane.

        Only useful if the topology chosen is not rectangular.
        """
        return self._xx.T, self._yy.T

    def convert_map_to_euclidean(self, xy):
        """Converts map coordinates into euclidean coordinates
        that reflects the chosen topology.

        Only useful if the topology chosen is not rectangular.
        """
        return self._xx.T[xy], self._yy.T[xy]

    def _activate(self, x):
        """Updates matrix activation_map, in this matrix
        the element i,j is the response of the neuron i,j to x
        (the configured activation distance between x and its weights)."""
        self._activation_map = self._activation_distance(x, self._weights)

    def activate(self, x):
        """Returns the activation map to x."""
        self._activate(x)
        return self._activation_map

    def _gaussian(self, c, sigma):
        """Returns a Gaussian centered in c."""
        # NOTE(review): the denominator is 2*pi*sigma^2 rather than the
        # textbook 2*sigma^2; kept as is because it affects training results.
        d = 2*pi*sigma*sigma
        ax = exp(-power(self._xx-self._xx.T[c], 2)/d)
        ay = exp(-power(self._yy-self._yy.T[c], 2)/d)
        # Element-wise product of the two per-axis factors, transposed so
        # that the result is indexed as [x, y].
        return (ax * ay).T

    def _mexican_hat(self, c, sigma):
        """Mexican hat centered in c."""
        p = power(self._xx-self._xx.T[c], 2) + power(self._yy-self._yy.T[c], 2)
        d = 2*pi*sigma*sigma
        return (exp(-p/d)*(1-2/d*p)).T

    def _bubble(self, c, sigma):
        """Constant function centered in c with spread sigma.
        sigma should be an odd value.
        """
        ax = logical_and(self._neigx > c[0]-sigma,
                         self._neigx < c[0]+sigma)
        ay = logical_and(self._neigy > c[1]-sigma,
                         self._neigy < c[1]+sigma)
        return outer(ax, ay)*1.

    def _triangle(self, c, sigma):
        """Triangular function centered in c with spread sigma."""
        triangle_x = (-abs(c[0] - self._neigx)) + sigma
        triangle_y = (-abs(c[1] - self._neigy)) + sigma
        triangle_x[triangle_x < 0] = 0.
        triangle_y[triangle_y < 0] = 0.
        return outer(triangle_x, triangle_y)

    def _cosine_distance(self, x, w):
        """Returns 1 - cosine similarity between x and each weight in w."""
        num = (w * x).sum(axis=2)
        denum = multiply(linalg.norm(w, axis=2), linalg.norm(x))
        # The small constant avoids a division by zero for null vectors.
        return 1 - num / (denum+1e-8)

    def _euclidean_distance(self, x, w):
        """Returns the euclidean distance between x and each weight in w
        (norm of x - w along the last axis)."""
        return linalg.norm(subtract(x, w), axis=-1)

    def _manhattan_distance(self, x, w):
        """Returns the manhattan (L1) distance between x and each weight
        in w, computed along the last axis."""
        return linalg.norm(subtract(x, w), ord=1, axis=-1)

    def _chebyshev_distance(self, x, w):
        """Returns the Chebyshev distance between x and each weight in w,
        i.e. the largest absolute coordinate difference on the last axis."""
        # The absolute value is required: without it a negative difference
        # of large magnitude would be ignored by the maximum.
        return max(abs(subtract(x, w)), axis=-1)

    def _check_iteration_number(self, num_iteration):
        """Raises ValueError if num_iteration is smaller than 1."""
        if num_iteration < 1:
            raise ValueError('num_iteration must be >= 1')

    def _check_input_len(self, data):
        """Checks that the data in input is of the correct shape.

        Only the length of the first sample is compared with input_len;
        a ValueError is raised when they differ."""
        data_len = len(data[0])
        if self._input_len != data_len:
            msg = 'Received %d features, expected %d.' % (data_len,
                                                          self._input_len)
            raise ValueError(msg)

    def winner(self, x):
        """Computes the coordinates of the winning neuron for the sample x
        (the neuron with the smallest activation distance)."""
        self._activate(x)
        return unravel_index(self._activation_map.argmin(),
                             self._activation_map.shape)

    def update(self, x, win, t, max_iteration):
        """Updates the weights of the neurons.

        The learning rate and sigma are decayed for iteration t, the
        neighborhood of the winner is weighted by the learning rate, and
        every weight is moved towards x proportionally to that value.

        Parameters
        ----------
        x : np.array
            Current pattern to learn.
        win : tuple
            Position of the winning neuron for x (array or tuple).
        t : int
            Iteration index
        max_iteration : int
            Maximum number of training iterations.
        """
        eta = self._decay_function(self._learning_rate, t, max_iteration)
        # sigma and learning rate decrease with the same rule
        sig = self._decay_function(self._sigma, t, max_iteration)
        # improves the performances
        g = self.neighborhood(win, sig)*eta
        # w_new = w + eta * neighborhood_function * (x-w)
        self._weights += einsum('ij, ijk->ijk', g, x-self._weights)

    def quantization(self, data):
        """Assigns a code book (weights vector of the winning neuron)
        to each sample in data."""
        self._check_input_len(data)
        # Flat index of the closest neuron for every sample.
        winners_coords = argmin(self._distance_from_weights(data), axis=1)
        return self._weights[unravel_index(winners_coords,
                                           self._weights.shape[:2])]

    def random_weights_init(self, data):
        """Initializes the weights of the SOM
        picking random samples from data."""
        self._check_input_len(data)
        # The activation map is iterated only to enumerate grid positions.
        it = nditer(self._activation_map, flags=['multi_index'])
        while not it.finished:
            rand_i = self._random_generator.randint(len(data))
            self._weights[it.multi_index] = data[rand_i]
            it.iternext()

    def pca_weights_init(self, data):
        """Initializes the weights to span the first two principal components.

        This initialization doesn't depend on random processes and
        makes the training process converge faster.

        It is strongly recommended to normalize the data before initializing
        the weights and use the same normalization for the training data.
        """
        if self._input_len == 1:
            msg = 'The data needs at least 2 features for pca initialization'
            raise ValueError(msg)
        self._check_input_len(data)
        if len(self._neigx) == 1 or len(self._neigy) == 1:
            msg = 'PCA initialization inappropriate:' + \
                  'One of the dimensions of the map is 1.'
            warn(msg)
        # Eigen-decomposition of the feature covariance matrix.
        pc_length, pc = linalg.eig(cov(transpose(data)))
        # Indexes of the eigenvalues sorted from the largest to the smallest.
        pc_order = argsort(-pc_length)
        # NOTE(review): numpy.linalg.eig documents eigenvectors as the
        # COLUMNS of its second result, while pc[k] below selects rows.
        # Left unchanged because test_pca_weights_init pins the current
        # values; confirm whether pc[:, k] was intended.
        for i, c1 in enumerate(linspace(-1, 1, len(self._neigx))):
            for j, c2 in enumerate(linspace(-1, 1, len(self._neigy))):
                self._weights[i, j] = c1*pc[pc_order[0]] + c2*pc[pc_order[1]]

    def train(self, data, num_iteration, random_order=False, verbose=False):
        """Trains the SOM.

        Parameters
        ----------
        data : np.array or list
            Data matrix.
        num_iteration : int
            Maximum number of iterations (one iteration per sample).
        random_order : bool (default=False)
            If True, samples are picked in random order.
            Otherwise the samples are picked sequentially.
        verbose : bool (default=False)
            If True the status of the training
            will be printed at each iteration.
        """
        self._check_iteration_number(num_iteration)
        self._check_input_len(data)
        random_generator = None
        if random_order:
            random_generator = self._random_generator
        iterations = _build_iteration_indexes(len(data), num_iteration,
                                              verbose, random_generator)
        for t, iteration in enumerate(iterations):
            self.update(data[iteration], self.winner(data[iteration]),
                        t, num_iteration)
        if verbose:
            print('\n quantization error:', self.quantization_error(data))

    def train_random(self, data, num_iteration, verbose=False):
        """Trains the SOM picking samples at random from data.

        Parameters
        ----------
        data : np.array or list
            Data matrix.
        num_iteration : int
            Maximum number of iterations (one iteration per sample).
        verbose : bool (default=False)
            If True the status of the training
            will be printed at each iteration.
        """
        self.train(data, num_iteration, random_order=True, verbose=verbose)

    def train_batch(self, data, num_iteration, verbose=False):
        """Trains the SOM using all the vectors in data sequentially.

        Parameters
        ----------
        data : np.array or list
            Data matrix.
        num_iteration : int
            Maximum number of iterations (one iteration per sample).
        verbose : bool (default=False)
            If True the status of the training
            will be printed at each iteration.
        """
        self.train(data, num_iteration, random_order=False, verbose=verbose)

    def distance_map(self):
        """Returns the distance map of the weights.

        Each cell is the normalised sum of the distances between
        a neuron and its neighbours. Note that this method uses
        the euclidean distance.
        """
        um = zeros((self._weights.shape[0],
                    self._weights.shape[1],
                    8))  # 2 spots more for hexagonal topology

        # Offsets of the eight neighbours on a rectangular grid. The list is
        # duplicated so that it can be indexed by parity like the hex case.
        ii = [[0, -1, -1, -1, 0, 1, 1, 1]]*2
        jj = [[-1, -1, 0, 1, 1, 1, 0, -1]]*2

        if self.topology == 'hexagonal':
            # Six neighbours, with offsets that depend on the parity of y.
            ii = [[1, 1, 1, 0, -1, 0], [0, 1, 0, -1, -1, -1]]
            jj = [[1, 0, -1, -1, 0, 1], [1, 0, -1, -1, 0, 1]]

        for x in range(self._weights.shape[0]):
            for y in range(self._weights.shape[1]):
                w_2 = self._weights[x, y]
                e = y % 2 == 0   # only used on hexagonal topology
                for k, (i, j) in enumerate(zip(ii[e], jj[e])):
                    # Skip neighbours that fall outside the map.
                    if (x+i >= 0 and x+i < self._weights.shape[0] and
                            y+j >= 0 and y+j < self._weights.shape[1]):
                        w_1 = self._weights[x+i, y+j]
                        um[x, y, k] = fast_norm(w_2-w_1)

        um = um.sum(axis=2)
        # Normalise by the largest summed distance.
        return um/um.max()

    def activation_response(self, data):
        """Returns a matrix where the element i,j is the number of times
        that the neuron i,j have been winner."""
        self._check_input_len(data)
        a = zeros((self._weights.shape[0], self._weights.shape[1]))
        for x in data:
            a[self.winner(x)] += 1
        return a

    def _distance_from_weights(self, data):
        """Returns a matrix d where d[i,j] is the euclidean distance between
        data[i] and the j-th weight (weights are flattened row by row)."""
        input_data = array(data)
        weights_flat = self._weights.reshape(-1, self._weights.shape[2])
        input_data_sq = power(input_data, 2).sum(axis=1, keepdims=True)
        weights_flat_sq = power(weights_flat, 2).sum(axis=1, keepdims=True)
        cross_term = dot(input_data, weights_flat.T)
        # |a-b|^2 = |a|^2 - 2ab + |b|^2. Rounding can leave a tiny negative
        # value when a sample coincides with a weight; clip it at zero so
        # that sqrt never yields NaN.
        squared = -2 * cross_term + input_data_sq + weights_flat_sq.T
        return sqrt(squared.clip(min=0))

    def quantization_error(self, data):
        """Returns the quantization error computed as the average
        distance between each input sample and its best matching unit."""
        self._check_input_len(data)
        return norm(data-self.quantization(data), axis=1).mean()

    def topographic_error(self, data):
        """Returns the topographic error computed by finding
        the best-matching and second-best-matching neuron in the map
        for each input and then evaluating the positions.

        A sample for which these two nodes are not adjacent counts as
        an error. The topographic error is given by the
        the total number of errors divided by the total of samples.

        If the topographic error is 0, no error occurred.
        If 1, the topology was not preserved for any of the samples.

        Raises NotImplementedError for the hexagonal topology and returns
        nan (with a warning) for a 1-by-1 map.
        """
        self._check_input_len(data)
        if self.topology == 'hexagonal':
            msg = 'Topographic error not implemented for hexagonal topology.'
            raise NotImplementedError(msg)
        total_neurons = prod(self._activation_map.shape)
        if total_neurons == 1:
            warn('The topographic error is not defined for a 1-by-1 map.')
            return nan

        # Two units further apart than this are not adjacent: diagonal
        # neighbours are at distance sqrt(2) ~= 1.414.
        t = 1.42
        # b2mu: best 2 matching units (flat indexes of the two closest).
        b2mu_inds = argsort(self._distance_from_weights(data), axis=1)[:, :2]
        b2my_xy = unravel_index(b2mu_inds, self._weights.shape[:2])
        b2mu_x, b2mu_y = b2my_xy[0], b2my_xy[1]
        dxdy = hstack([diff(b2mu_x), diff(b2mu_y)])
        distance = norm(dxdy, axis=1)
        return (distance > t).mean()

    def win_map(self, data, return_indices=False):
        """Returns a dictionary wm where wm[(i,j)] is a list with:
        - all the patterns that have been mapped to the position (i,j),
          if return_indices=False (default)
        - all indices of the elements that have been mapped to the
          position (i,j) if return_indices=True
        """
        self._check_input_len(data)
        winmap = defaultdict(list)
        for i, x in enumerate(data):
            winmap[self.winner(x)].append(i if return_indices else x)
        return winmap

    def labels_map(self, data, labels):
        """Returns a dictionary wm where wm[(i,j)] is a dictionary
        that contains the number of samples from a given label
        that have been mapped in position i,j.

        Parameters
        ----------
        data : np.array or list
            Data matrix.
        labels : np.array or list
            Labels for each sample in data.

        Raises
        ------
        ValueError
            If data and labels have different lengths.
        """
        self._check_input_len(data)
        if len(data) != len(labels):
            raise ValueError('data and labels must have the same length.')
        winmap = defaultdict(list)
        # First collect the labels of the samples won by each neuron ...
        for x, l in zip(data, labels):
            winmap[self.winner(x)].append(l)
        # ... then turn every list into per-label counts.
        for position in winmap:
            winmap[position] = Counter(winmap[position])
        return winmap

class TestMinisom(unittest.TestCase):
    """Unit tests exercising the functions of MiniSom."""

    def setUp(self):
        """Builds a 5x5 map with one feature and replaces its weights with
        known values so that the winners are predictable."""
        self.som = MiniSom(5, 5, 1)
        for i in range(5):
            for j in range(5):
                # checking weights normalization
                assert_almost_equal(1.0, linalg.norm(self.som._weights[i, j]))
        self.som._weights = zeros((5, 5, 1))  # fake weights
        self.som._weights[2, 3] = 5.0
        self.som._weights[1, 1] = 2.0

    def test_decay_function(self):
        assert self.som._decay_function(1., 2., 3.) == 1./(1.+2./(3./2))

    def test_fast_norm(self):
        assert fast_norm(array([1, 3])) == sqrt(1+9)

    def test_euclidean_distance(self):
        x = zeros((1, 2))
        w = ones((2, 2, 2))
        d = self.som._euclidean_distance(x, w)
        assert_array_almost_equal(d, [[1.41421356, 1.41421356],
                                      [1.41421356, 1.41421356]])

    def test_cosine_distance(self):
        x = zeros((1, 2))
        w = ones((2, 2, 2))
        d = self.som._cosine_distance(x, w)
        assert_array_almost_equal(d, [[1., 1.],
                                      [1., 1.]])

    def test_manhattan_distance(self):
        x = zeros((1, 2))
        w = ones((2, 2, 2))
        d = self.som._manhattan_distance(x, w)
        assert_array_almost_equal(d, [[2., 2.],
                                      [2., 2.]])

    def test_chebyshev_distance(self):
        x = array([1, 3])
        w = ones((2, 2, 2))
        d = self.som._chebyshev_distance(x, w)
        assert_array_almost_equal(d, [[2., 2.],
                                      [2., 2.]])

    def test_check_input_len(self):
        with self.assertRaises(ValueError):
            self.som.train_batch([[1, 2]], 1)

        with self.assertRaises(ValueError):
            self.som.random_weights_init(array([[1, 2]]))

        with self.assertRaises(ValueError):
            self.som._check_input_len(array([[1, 2]]))

        # Correctly shaped data must be accepted without raising.
        self.som._check_input_len(array([[1]]))
        self.som._check_input_len([[1]])

    def test_unavailable_neigh_function(self):
        with self.assertRaises(ValueError):
            MiniSom(5, 5, 1, neighborhood_function='boooom')

    def test_unavailable_distance_function(self):
        with self.assertRaises(ValueError):
            MiniSom(5, 5, 1, activation_distance='ridethewave')

    def test_gaussian(self):
        bell = self.som._gaussian((2, 2), 1)
        assert bell.max() == 1.0
        assert bell.argmax() == 12  # unravel(12) = (2,2)

    def test_mexican_hat(self):
        bell = self.som._mexican_hat((2, 2), 1)
        assert bell.max() == 1.0
        assert bell.argmax() == 12  # unravel(12) = (2,2)

    def test_bubble(self):
        bubble = self.som._bubble((2, 2), 1)
        assert bubble[2, 2] == 1
        assert sum(sum(bubble)) == 1

    def test_triangle(self):
        bubble = self.som._triangle((2, 2), 1)
        assert bubble[2, 2] == 1
        assert sum(sum(bubble)) == 1

    def test_win_map(self):
        winners = self.som.win_map([[5.0], [2.0]])
        assert winners[(2, 3)][0] == [5.0]
        assert winners[(1, 1)][0] == [2.0]

    def test_win_map_indices(self):
        winners = self.som.win_map([[5.0], [2.0]], return_indices=True)
        assert winners[(2, 3)] == [0]
        assert winners[(1, 1)] == [1]

    def test_labels_map(self):
        labels_map = self.som.labels_map([[5.0], [2.0]], ['a', 'b'])
        assert labels_map[(2, 3)]['a'] == 1
        assert labels_map[(1, 1)]['b'] == 1
        with self.assertRaises(ValueError):
            self.som.labels_map([[5.0]], ['a', 'b'])

    def test_activation_reponse(self):
        response = self.som.activation_response([[5.0], [2.0]])
        assert response[2, 3] == 1
        assert response[1, 1] == 1

    def test_activate(self):
        assert self.som.activate(5.0).argmin() == 13.0  # unravel(13) = (2,3)

    def test_distance_from_weights(self):
        data = arange(-5, 5).reshape(-1, 1)
        weights = self.som._weights.reshape(-1, self.som._weights.shape[2])
        distances = self.som._distance_from_weights(data)
        for i in range(len(data)):
            for j in range(len(weights)):
                assert(distances[i][j] == norm(data[i] - weights[j]))

    def test_quantization_error(self):
        assert self.som.quantization_error([[5], [2]]) == 0.0
        assert self.som.quantization_error([[4], [1]]) == 1.0

    def test_topographic_error(self):
        # 5 will have bmu_1 in (2,3) and bmu_2 in (2, 4)
        # which are in the same neighborhood
        self.som._weights[2, 4] = 6.0
        # 15 will have bmu_1 in (4, 4) and bmu_2 in (0, 0)
        # which are not in the same neighborhood
        self.som._weights[4, 4] = 15.0
        self.som._weights[0, 0] = 14.
        assert self.som.topographic_error([[5]]) == 0.0
        assert self.som.topographic_error([[15]]) == 1.0

        self.som.topology = 'hexagonal'
        with self.assertRaises(NotImplementedError):
            assert self.som.topographic_error([[5]]) == 0.0
        self.som.topology = 'rectangular'

    def test_quantization(self):
        q = self.som.quantization(array([[4], [2]]))
        assert q[0] == 5.0
        assert q[1] == 2.0

    def test_random_seed(self):
        som1 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
        som2 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
        # same initialization
        assert_array_almost_equal(som1._weights, som2._weights)
        data = random.rand(100, 2)
        som1 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
        som1.train_random(data, 10)
        som2 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
        som2.train_random(data, 10)
        # same state after training
        assert_array_almost_equal(som1._weights, som2._weights)

    def test_train_batch(self):
        som = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
        data = array([[4, 2], [3, 1]])
        q1 = som.quantization_error(data)
        som.train(data, 10)
        assert q1 > som.quantization_error(data)

        data = array([[1, 5], [6, 7]])
        q1 = som.quantization_error(data)
        som.train_batch(data, 10, verbose=True)
        assert q1 > som.quantization_error(data)

    def test_train_random(self):
        som = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
        data = array([[4, 2], [3, 1]])
        q1 = som.quantization_error(data)
        som.train(data, 10, random_order=True)
        assert q1 > som.quantization_error(data)

        data = array([[1, 5], [6, 7]])
        q1 = som.quantization_error(data)
        som.train_random(data, 10, verbose=True)
        assert q1 > som.quantization_error(data)

    def test_random_weights_init(self):
        som = MiniSom(2, 2, 2, random_seed=1)
        som.random_weights_init(array([[1.0, .0]]))
        for w in som._weights:
            assert_array_equal(w[0], array([1.0, .0]))

    def test_pca_weights_init(self):
        som = MiniSom(2, 2, 2)
        som.pca_weights_init(array([[1., 0.], [0., 1.], [1., 0.], [0., 1.]]))
        expected = array([[[0., -1.41421356], [-1.41421356, 0.]],
                          [[1.41421356, 0.], [0., 1.41421356]]])
        assert_array_almost_equal(som._weights, expected)

    def test_distance_map(self):
        som = MiniSom(2, 2, 2, random_seed=1)
        som._weights = array([[[1., 0.], [0., 1.]], [[1., 0.], [0., 1.]]])
        assert_array_equal(som.distance_map(), array([[1., 1.], [1., 1.]]))

        som = MiniSom(2, 2, 2, topology='hexagonal', random_seed=1)
        som._weights = array([[[1., 0.], [0., 1.]], [[1., 0.], [0., 1.]]])
        assert_array_equal(som.distance_map(), array([[.5, 1.], [1., .5]]))

    def test_pickling(self):
        """A map must survive a pickle round trip with its weights intact."""
        try:
            with open('som.p', 'wb') as outfile:
                pickle.dump(self.som, outfile)
            with open('som.p', 'rb') as infile:
                restored = pickle.load(infile)
            assert_array_equal(restored._weights, self.som._weights)
        finally:
            # Remove the scratch file even when dump/load fails, so that a
            # failing run does not leave 'som.p' behind.
            if os.path.exists('som.p'):
                os.remove('som.p')

应用：

import numpy as np
from minisom1 import MiniSom
import pickle

# Reference: parameters of MiniSom(...)
# -------------------------------------
# A rule of thumb to set the size of the grid for a dimensionality
# reduction task is that it should contain 5*sqrt(N) neurons
# where N is the number of samples in the dataset to analyze.
# E.g. if your dataset has 150 samples, 5*sqrt(150) = 61.23
# hence a map 8-by-8 should perform well.
#
# x : int
#     x dimension of the SOM (competitive layer).
# y : int
#     y dimension of the SOM (competitive layer).
# input_len : int
#     Number of the elements of the vectors in input.
# sigma : float, optional (default=1.0)
#     Spread of the neighborhood function, needs to be adequate
#     to the dimensions of the map; it shrinks over time
#     (at the iteration t we have sigma(t) = sigma / (1 + t/T)
#     where T is #num_iteration/2).
# learning_rate : initial learning rate, decreases with the iterations
#     (at the iteration t we have
#     learning_rate(t) = learning_rate / (1 + t/T)
#     where T is #num_iteration/2).
# decay_function : function
#     Function that reduces learning_rate and sigma at each iteration
#     (the same function is used for both); the default function is:
#         learning_rate / (1+t/(max_iterations/2))
#     A custom decay function will need to take in input
#     three parameters in the following order:
#     1. learning rate
#     2. current iteration
#     3. maximum number of iterations allowed
#     Note that if a lambda function is used to define the decay
#     MiniSom will not be picklable anymore.
# neighborhood_function : string, optional (default='gaussian')
#     Function that weights the neighborhood of a position in the map.
#     Possible values: 'gaussian', 'mexican_hat', 'bubble', 'triangle'
# topology : string, optional (default='rectangular')
#     Topology of the map.
#     Possible values: 'rectangular', 'hexagonal'
# activation_distance : string, optional (default='euclidean')
#     Distance used to activate the map.
#     Possible values: 'euclidean', 'cosine', 'manhattan', 'chebyshev'
# random_seed : int, optional (default=None)
#     Random seed to use.

# --- Data: 7 samples with 4 features each ---
data = [[0.80, 0.55, 0.22, 0.03],
        [0.82, 0.50, 0.23, 0.03],
        [0.80, 0.54, 0.22, 0.03],
        [0.80, 0.53, 0.26, 0.03],
        [0.79, 0.56, 0.22, 0.03],
        [0.75, 0.60, 0.25, 0.03],
        [0.77, 0.59, 0.22, 0.03]]

# --- Define the network: a 5x5 SOM for 4-dimensional inputs ---
# x / y: dimensions of the competitive layer
# input_len: dimension of the input vectors
# neighborhood_function: 'gaussian', 'mexican_hat', 'bubble' or 'triangle'
# sigma: spread parameter of the neighborhood function
# topology (not passed here): 'rectangular' (default) or 'hexagonal'
# activation_distance (not passed here): 'euclidean' (default), 'cosine',
#     'manhattan' or 'chebyshev'
# decay_function (not passed here): shared by learning rate and sigma;
#     the default is learning_rate / (1+t/(max_iterations/2))
som = MiniSom(x=5, y=5, input_len=4, neighborhood_function="gaussian",
              sigma=2, learning_rate=0.5)

# --- Train the network for 10000 iterations ---
som.train(data, 10000)

# --- Look up the winning neuron of one sample ---
sample = data[0]
winner_neuro_index = som.winner(sample)  # grid coordinates of the winner
print(winner_neuro_index)

# --- Print the weights of the SOM ---
# All the weights
som_weights = som.get_weights()
print(som_weights)

# Weights of the neuron at a given position: a vector as long as the input
i, j = winner_neuro_index[0], winner_neuro_index[1]
neuro_weight = som_weights[i, j, :]
print(neuro_weight)

# --- Quantization ---
# For each of the given samples, get the weight vector of its closest
# neuron (one row per sample).
data_som_weights = som.quantization(data[0:3])
print(data_som_weights)

# --- Save and load the model ---
with open('som.p', 'wb') as outfile:
    pickle.dump(som, outfile)

# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code from an untrusted file.
with open('som.p', 'rb') as infile:
    som = pickle.load(infile)