def md5_sum(filename):
    """Compute the MD5 hex digest of *filename*, reading it line by line.

    Memory stays bounded by the longest line in the file, so this works
    on files far larger than RAM (as long as lines are short).

    :param filename: path of the file to hash (opened in binary mode)
    :return: the 32-character lowercase hex digest string
    """
    md5 = hashlib.md5()
    with open(filename, "rb") as f:
        # Iterating a binary file object yields one line per step;
        # this replaces the manual readline() loop (which also fed a
        # harmless trailing b"" into the hash) with the idiomatic form.
        for line in f:
            md5.update(line)
    f_md5 = md5.hexdigest()
    print(f_md5)
    return f_md5
我以一个1.4G的文件进行测试
4dcafe2e69d6d0a8979cb4eb7c26c3fd
耗时: 6.345685958862305
Current memory usage is 0.008476MB; Peak was 5.260233MB
耗时 6s,峰值内存占用5.2M
def md5_sum2(filename):
    """Compute the MD5 hex digest of *filename* via readlines().

    NOTE: readlines() materializes every line of the file in memory at
    once, so peak memory usage is roughly the size of the whole file.
    Kept deliberately as the article's counter-example.

    :param filename: path of the file to hash (opened in binary mode)
    :return: the 32-character lowercase hex digest string
    """
    digest = hashlib.md5()
    with open(filename, "rb") as fp:
        fp.seek(0)
        # Entire file is loaded into a list of bytes objects here.
        for chunk in fp.readlines():
            digest.update(chunk)
    result = digest.hexdigest()
    print(result)
    return result
一次性将文件读到内存中,如果内存不够会抛出 MemoryError
4dcafe2e69d6d0a8979cb4eb7c26c3fd
耗时: 7.258216857910156
Current memory usage is 0.008476MB; Peak was 1654.899472MB
耗时 7.2s,峰值内存占用1654M
反而速度更慢!
def md5_sum3(filename):
    """Compute the MD5 hex digest of *filename* in fixed-size chunks.

    Reads the file 16 KiB at a time, so peak memory usage is a single
    chunk regardless of file size.

    :param filename: path of the file to hash (opened in binary mode)
    :return: the 32-character lowercase hex digest string
    """
    checksum = hashlib.md5()
    block_size = 1024 * 16
    with open(filename, "rb") as fp:
        fp.seek(0)
        # Plain chunked read loop; an empty read signals end of file.
        while True:
            chunk = fp.read(block_size)
            if not chunk:
                break
            checksum.update(chunk)
    result = checksum.hexdigest()
    print(result)
    return result
利用read block_size读取大小限制,构造生成器
4dcafe2e69d6d0a8979cb4eb7c26c3fd
耗时: 3.0910959243774414
Current memory usage is 0.014733MB; Peak was 0.053139MB
耗时 3s,峰值内存占用0.05M
def md5_sum4(filename):
    """Compute the MD5 hex digest of *filename* in 1 MiB slices.

    Uses the two-argument iter(callable, sentinel) form: the lambda is
    called repeatedly and iteration stops when it returns b"" (EOF), so
    at most one slice is held in memory at a time.

    :param filename: path of the file to hash (opened in binary mode)
    :return: the 32-character lowercase hex digest string
    """
    slice_size = 1024 * 1024
    digest = hashlib.md5()
    with open(filename, "rb") as fp:
        for block in iter(lambda: fp.read(slice_size), b""):
            digest.update(block)
    result = digest.hexdigest()
    print(result)
    return result
itertools 的惰性迭代器在 for 循环中有更好的性能
这里切片大小设置为 1024*1024
4dcafe2e69d6d0a8979cb4eb7c26c3fd
耗时: 2.7328171730041504
Current memory usage is 0.00074MB; Peak was 2.103334MB
耗时 2.7s,峰值内存占用2.1M