关于Linux：如何使用python用硬链接替换重复的文件？

How to replace duplicate files with hard links using python?

我是个摄影师，做很多备份。多年来，我积攒了很多硬盘。现在我买了一个NAS，并使用rsync把我所有的图片复制到了一个3 TB的RAID 1阵列上。根据我的脚本，这些文件中大约有1 TB是重复的。这是因为我在删除笔记本电脑上的文件之前做了多次备份，而且非常混乱。我确实在旧硬盘上保留了所有这些文件的备份，但如果我的脚本把事情搞砸了，那将是一件痛苦的事。你能看看我的重复文件查找脚本，告诉我你认为我是否可以运行它吗？我在一个测试文件夹上试过了，看起来还可以，但我不想把NAS搞得一团糟。

脚本分为三个步骤，分别放在三个文件中。在第一部分中，我找到所有的图像和元数据文件，并把它们放进一个shelve数据库(datenbank)，以文件大小作为键。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

import os
import shelve

# Step 1 of 3: walk ``path_to_search`` and record every image / sidecar file in
# a shelve database, keyed by file size (as a string) with a list of paths as
# the value.  Files with a unique size cannot have duplicates, so later steps
# only need to hash the sizes that list more than one path.
datenbank = shelve.open(os.path.join(os.path.dirname(__file__), "shelve_step1"), flag='c', protocol=None, writeback=False)

# For a dry run on a small tree, point this at a test folder instead, e.g.
# os.path.join(os.path.dirname(__file__), "test").
path_to_search = "/volume1/backup_2tb_wd/"
file_exts = ["xmp", "jpg", "JPG", "XMP", "cr2", "CR2", "PNG", "png", "tiff", "TIFF"]
walker = os.walk(path_to_search)

counter = 0

# Compare against the real filename suffix, case-insensitively.  A plain
# substring test ("png" in filename) would also accept names such as
# "pngs_list.txt" or "photo.jpg.bak".
wanted_suffixes = set("." + ext.lower() for ext in file_exts)

try:
    for dirpath, dirnames, filenames in walker:
        for filename in filenames:
            counter += 1
            print(str(counter))  # progress: number of files seen so far
            if os.path.splitext(filename)[1].lower() not in wanted_suffixes:
                continue
            filepath = os.path.join(dirpath, filename)
            try:
                filesize = str(os.path.getsize(filepath))
            except OSError as err:
                # e.g. a dangling symlink or a file removed during the walk
                print("skipped (cannot stat): %s (%s)" % (filepath, err))
                continue
            # writeback=False: the list must be re-assigned to be persisted.
            tmp = datenbank.get(filesize, [])
            if filepath not in tmp:
                tmp.append(filepath)
                datenbank[filesize] = tmp

    datenbank.sync()
    print("done")
finally:
    # Close the shelf even when the walk is interrupted so that the entries
    # collected so far are flushed to disk.
    datenbank.close()

第二部分。现在，我跳过列表中只有一个文件的所有文件大小，并创建另一个shelve数据库，以MD5哈希为键，文件列表为值。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

import os
import shelve
import hashlib

# Input: the size -> [paths] index written by step 1.  It is only read here,
# so open it read-only: a wrong path then fails immediately instead of
# silently creating an empty database and reporting "done".
datenbank = shelve.open(os.path.join(os.path.dirname(__file__), "shelve_step1"), flag='r', protocol=None, writeback=False)

# Output: MD5 hex digest -> [paths] for every file whose size is not unique.
datenbank_step2 = shelve.open(os.path.join(os.path.dirname(__file__), "shelve_step2"), flag='c', protocol=None, writeback=False)

# Running totals judged by file size alone (before any hashing):
counter = 0  # number of files beyond the first in each same-size group
space = 0  # bytes occupied by those extra files

def md5Checksum(filePath, chunk_size=8192):
    """Return the hexadecimal MD5 digest of the file at *filePath*.

    The file is opened in binary mode and fed to the hash in pieces of
    *chunk_size* bytes, so a large file never has to fit in memory.

    Raises ValueError if *chunk_size* is smaller than 1; errors from opening
    or reading the file (IOError / OSError) are propagated to the caller.
    """
    if chunk_size < 1:
        # read(0) returns b'' at once and read(-1) slurps the whole file;
        # neither is what a caller asking for chunked reading wants.
        raise ValueError("chunk_size must be a positive integer")
    m = hashlib.md5()
    with open(filePath, 'rb') as fh:
        # iter() with a sentinel keeps calling read() until it returns b'' (EOF).
        for data in iter(lambda: fh.read(chunk_size), b''):
            m.update(data)
    return m.hexdigest()

# Step 2 of 3: for every file size shared by more than one path, hash each
# path with MD5 and group the paths by digest in ``datenbank_step2``.
try:
    for filesize in datenbank:
        filepaths = datenbank[filesize]
        filepath_count = len(filepaths)
        if filepath_count <= 1:
            # A unique size cannot have a duplicate; nothing to hash.
            continue
        counter += filepath_count - 1
        space += (filepath_count - 1) * int(filesize)
        for filepath in filepaths:
            print(counter)  # progress output, once per hashed file
            try:
                checksum = md5Checksum(filepath)
            except (IOError, OSError) as err:
                # The file may have been moved or deleted since step 1.
                # Skipping it only means it will not be deduplicated.
                print("skipped (cannot read): %s (%s)" % (filepath, err))
                continue
            # writeback=False: the list must be re-assigned to be persisted.
            temp = datenbank_step2.get(checksum, [])
            if filepath not in temp:
                temp.append(filepath)
                datenbank_step2[checksum] = temp

    print(counter)
    print(str(space))

    datenbank_step2.sync()
finally:
    # Close both shelves even if the run is interrupted, so the digests
    # computed so far are flushed and the step 1 database is released.
    datenbank_step2.close()
    datenbank.close()
print("done")

最后是最危险的部分。对于每个MD5键，我取出文件列表并额外计算一次SHA1。如果匹配，我将删除该列表中除第一个文件外的所有文件，并创建硬链接来替换被删除的文件。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

import os
import shelve
import hashlib

# Input: the MD5 digest -> [paths] index written by step 2.  It is only read
# in this step, so open it read-only: a wrong path then fails immediately
# instead of silently creating an empty database.
datenbank = shelve.open(os.path.join(os.path.dirname(__file__), "shelve_step2"), flag='r', protocol=None, writeback=False)

def sha1Checksum(filePath, chunk_size=8192):
    """Return the hexadecimal SHA-1 digest of the file at *filePath*.

    The file is opened in binary mode and fed to the hash in pieces of
    *chunk_size* bytes, so a large file never has to fit in memory.

    Raises ValueError if *chunk_size* is smaller than 1; errors from opening
    or reading the file (IOError / OSError) are propagated to the caller.
    """
    if chunk_size < 1:
        # read(0) returns b'' at once and read(-1) slurps the whole file;
        # neither is what a caller asking for chunked reading wants.
        raise ValueError("chunk_size must be a positive integer")
    m = hashlib.sha1()
    with open(filePath, 'rb') as fh:
        # iter() with a sentinel keeps calling read() until it returns b'' (EOF).
        for data in iter(lambda: fh.read(chunk_size), b''):
            m.update(data)
    return m.hexdigest()

def _replace_with_hardlink(original, duplicate):
    """Replace *duplicate* with a hard link to *original* without a data-loss window.

    The link is first created under a temporary name next to *duplicate* and
    then renamed over it.  If creating the link fails (different filesystem,
    link limit reached, permissions, temporary name already taken),
    *duplicate* is left untouched and OSError propagates.

    The caller must make sure the two paths are not already the same inode:
    POSIX rename() does nothing when source and target are links to the same
    file, which would leave the temporary name behind.
    """
    tmp_path = duplicate + ".hardlink_tmp"
    os.link(original, tmp_path)
    try:
        # On POSIX, rename() replaces an existing target in one step.
        os.rename(tmp_path, duplicate)
    except OSError:
        os.unlink(tmp_path)
        raise


# Step 3 of 3: within every MD5 group, keep the first readable path as the
# original and replace each later path whose SHA-1 also matches by a hard
# link to it.
# NOTE: hard links share one inode, so afterwards all linked paths also share
# content changes, permissions and timestamps.
try:
    for hashvalue in datenbank:
        original = None
        original_checksum = None
        for path in datenbank[hashvalue]:
            try:
                if original is None:
                    original_checksum = sha1Checksum(path)
                    original = path
                    continue
                if os.path.samefile(original, path):
                    # Already linked (e.g. by an earlier run); no need to
                    # re-read the file.
                    continue
                if sha1Checksum(path) != original_checksum:
                    # Same MD5 but different SHA-1: not a duplicate, keep it.
                    continue
                _replace_with_hardlink(original, path)
            except (IOError, OSError) as err:
                # Missing/unreadable file or a link that could not be made;
                # the file at `path` has not been removed in either case.
                print("skipped: %s (%s)" % (path, err))
                continue
            print("delete: %s" % path)
finally:
    datenbank.close()
print("done")

你怎么看？非常感谢。

*如果这一点有影响的话：这台NAS是Synology（群晖）713+，文件系统是ext3或ext4。