Replace NaN's in NumPy array with closest non-NaN value
我有一个NumPy数组
1 2 3 4 5 | >>> str(a) '[ nan nan nan 1.44955726 1.44628034 1.44409573\ 1.4408188 1.43657094 1.43171624 1.42649744 1.42200684 1.42117704\ 1.42040255 1.41922908 nan nan nan nan\ nan nan]' |
我想用最接近的非NaN值替换每个NaN,以便将开头的所有NaN都设置为 1.44955726,并将结尾的所有NaN都设置为 1.41922908。
我可以看到如何针对像这样的特定情况执行此操作,但是我通常需要能够对任意长度的数组执行此操作,并且在数组的开头和结尾都应具有任意长度的NaN(不会有NaN位于数字中间)。有什么想法吗?
作为替代解决方案,可以使用 `np.interp`(它还会对数组中间的NaN进行线性插值):
import numpy as np

# Generate data...
data = np.random.random(10)
data[:2] = np.nan
data[-1] = np.nan
data[4:6] = np.nan
print(data)

# Fill in NaN's...
# np.interp evaluates the known samples (positions of the non-NaN entries and
# their values) at the positions of the NaN entries. Interior gaps are
# linearly interpolated; positions before the first / after the last known
# sample receive the first / last known value, which is exactly the
# "closest non-NaN value" for leading and trailing NaNs.
mask = np.isnan(data)
data[mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), data[~mask])
print(data)
这将产生:
1 2 3 4 5 | [ nan nan 0.31619306 0.25818765 nan nan 0.27410025 0.23347532 0.02418698 nan] [ 0.31619306 0.31619306 0.31619306 0.25818765 0.26349185 0.26879605 0.27410025 0.23347532 0.02418698 0.02418698] |
I want to replace each NaN with the closest non-NaN value... there will be no NaN's in the middle of the numbers
将执行以下操作:
# Positions of all entries of `a` that are not NaN.
ind = np.nonzero(~np.isnan(a))[0]
first = ind[0]
last = ind[-1]
# Copy the first valid value over the leading run and the last valid value
# over the trailing run.
a[:first] = a[first]
a[last + 1:] = a[last]
NaN 有一个有趣的特性:它与自身比较时并不相等,因此可以直接找到非NaN元素的索引:
# NaN is the only value that compares unequal to itself, so `a == a` is True
# exactly at the non-NaN positions.
idx = (a == a).nonzero()[0]
现在很容易用所需的值替换nans:
# `idx` holds the positions of the non-NaN entries of `a`.
# Everything before the first valid entry takes that entry's value ...
a[:idx[0]] = a[idx[0]]
# ... and everything after the last valid entry takes the last valid value.
a[idx[-1] + 1:] = a[idx[-1]]
最后,我们可以将其放在函数中:
import numpy as np


def FixNaNs(arr):
    """Fill leading and trailing NaNs of a 1-D array with the nearest valid value.

    Entries before the first non-NaN element are set to that element, and
    entries after the last non-NaN element are set to that element. NaNs
    located between valid entries are left untouched.

    The array is modified in place; the same object is returned for
    convenience.

    Args:
        arr: One-dimensional NumPy array.

    Returns:
        ``arr`` itself after filling, or ``None`` if it contains no non-NaN
        value (there is nothing to fill with).

    Raises:
        ValueError: If ``arr`` is not one-dimensional.
    """
    if arr.ndim != 1:
        raise ValueError("Only 1D arrays are supported.")

    # NaN is the only value that is not equal to itself, so `arr == arr`
    # marks exactly the non-NaN positions.
    idxs = np.nonzero(arr == arr)[0]
    if idxs.size == 0:
        return None

    first, last = idxs[0], idxs[-1]
    # Slice assignment fills each edge run in one vectorised step.
    arr[:first] = arr[first]
    arr[last + 1:] = arr[last]
    return arr
编辑
好吧,来自C的我总是忘记列表范围... @aix的解决方案比我的C ish循环更优雅,更高效,请使用它代替我的Cish循环。
我遇到了这个问题,不得不为分散的NaN找到一个定制的解决方案。下面的函数将任何NaN替换为右边的第一个数字,如果不存在,则将其替换为左边的第一个数字。可以进行进一步的处理,以将其替换为边界出现的平均值。
import numpy as np

Data = np.array([np.nan, 1.3, np.nan, 1.4, np.nan, np.nan])

# Index arrays of the NaN entries and of the valid entries, both ascending.
nansIndx = np.where(np.isnan(Data))[0]
isanIndx = np.where(~np.isnan(Data))[0]

for pos in nansIndx:
    # Valid positions strictly to the right of this NaN.
    to_right = isanIndx[isanIndx > pos]
    if to_right.size:
        # Prefer the first valid value on the right.
        source = to_right[0]
    else:
        # Nothing on the right: fall back to the closest valid value on the left.
        source = isanIndx[isanIndx < pos][-1]
    Data[pos] = Data[source]
结果是:
1 2 | >>> Data array([ 1.3, 1.3, 1.4, 1.4, 1.4, 1.4]) |
递归解决方案!
def replace_leading_NaN(a, offset=0):
    """Overwrite the run of NaNs starting at ``offset`` with the next valid value.

    Scans forward from ``offset`` to the first element that is not NaN and
    writes that value into every position visited on the way. ``a`` is
    modified in place.

    Args:
        a: Mutable, indexable sequence of floats (e.g. a list or 1-D array).
        offset: Index at which the scan starts.

    Returns:
        The first non-NaN value found at or after ``offset``.

    Raises:
        IndexError: If the scan runs off the end without finding a valid value.
    """
    import math  # local import keeps this snippet self-contained

    end = offset
    # Plain floats and NumPy scalars have no ``isNaN`` method; math.isnan
    # accepts both. Iterating instead of recursing avoids hitting the
    # recursion limit on long NaN runs.
    while math.isnan(a[end]):
        end += 1
    fill = a[end]
    for i in range(offset, end):
        a[i] = fill
    return fill


def replace_trailing_NaN(a, offset=-1):
    """Overwrite the run of NaNs ending at ``offset`` with the previous valid value.

    Scans backward from ``offset`` (by default the last element) to the first
    element that is not NaN and writes that value into every position visited
    on the way. ``a`` is modified in place.

    Args:
        a: Mutable, indexable sequence of floats (e.g. a list or 1-D array).
        offset: Index at which the backward scan starts.

    Returns:
        The first non-NaN value found at or before ``offset``.

    Raises:
        IndexError: If the scan runs off the start without finding a valid value.
    """
    import math  # local import keeps this snippet self-contained

    end = offset
    while math.isnan(a[end]):
        end -= 1
    fill = a[end]
    for i in range(offset, end, -1):
        a[i] = fill
    return fill
这是使用简单Python迭代器的解决方案。实际上,它们在这里比 `numpy.where` 更高效,尤其是对于大型数组:
import numpy as np

# np.nan is the canonical spelling; the upper-case alias np.NAN is a legacy
# name that is not guaranteed to exist in newer NumPy releases.
a = np.array([np.nan, np.nan, np.nan, 1.44955726, 1.44628034, 1.44409573,
              1.4408188, 1.43657094, 1.43171624, 1.42649744, 1.42200684,
              1.42117704, 1.42040255, 1.41922908, np.nan, np.nan, np.nan,
              np.nan, np.nan, np.nan])

mask = np.isfinite(a)

# get first value in list
# The generator stops at the first finite entry, so only the leading run of
# NaNs is inspected.
first = next((i for i, ok in enumerate(mask) if ok), None)
if first is None:
    # Without this check `first`/`last` would stay undefined and the fill
    # below would fail with an unrelated NameError.
    raise ValueError("array contains no finite values")

# get last value in list
# Same early-exit scan, walking backwards from the end.
last = next(i for i in range(len(mask) - 1, -1, -1) if mask[i])

# fill NaN with near known value on the edges
a = np.copy(a)
a[:first] = a[first]
a[last + 1:] = a[last]

print(a)
输出:
1 2 3 4 | [1.44955726 1.44955726 1.44955726 1.44955726 1.44628034 1.44409573 1.4408188 1.43657094 1.43171624 1.42649744 1.42200684 1.42117704 1.42040255 1.41922908 1.41922908 1.41922908 1.41922908 1.41922908 1.41922908 1.41922908] |
它仅替换开头和结尾的NaN,正如此处问题所要求的那样。
我有这样的东西
# Positions of the entries of `a` that are not NaN.
i = [pos for pos, val in enumerate(a) if not np.isnan(val)]
# Clamp every position into [i[0], i[-1]]: positions before the first valid
# entry read a[i[0]], positions after the last valid entry read a[i[-1]],
# and everything in between reads its own value.
a = [a[min(max(x, i[0]), i[-1])] for x in range(len(a))]
这有点笨拙,因为它被拆成了两行,并且其中一行还包含嵌套的内联 if 表达式。