假设我的结构是这样的
/-- am here
/one/some/dir
/two
/three/has/many/leaves
/hello/world
再假设 /one/some/dir 中有一个 500 MB 的大文件,而 /three/has/many/leaves 路径上的每一级文件夹中都各有一个 400 MB 的文件。
我想生成每个目录的大小,得到这个输出
/ - in total for all
/one/some/dir 500mb
/two 0
/three/has/many/leaves - 400mb
/three/has/many 800
/three/has/ 800+someotherbigfilehere
我该怎么做?
可以看看 os.walk。具体来说,官方文档中有一个计算目录大小的示例:
import os
from os.path import join, getsize
# Report, for every directory in the tree, how many bytes its own
# non-directory files occupy.
for root, dirs, files in os.walk('python/Lib/email'):
    file_bytes = sum(getsize(join(root, name)) for name in files)
    # A single formatted print() call: the Python 2 ``print a, b,`` statement
    # form is a SyntaxError on Python 3, whereas this spelling emits the same
    # line on both interpreters.
    print("%s consumes %s bytes in %s non-directory files"
          % (root, file_bytes, len(files)))
    if 'CVS' in dirs:
        dirs.remove('CVS')  # don't visit CVS directories
这应该很容易根据您的目的进行修改。
这是回应您评论的未经测试的版本:
import os
from os.path import join, getsize
# Maps every visited directory path to its cumulative size in bytes.
dirs_dict = {}

# We need to walk the tree from the bottom up so that a directory can have easy
# access to the size of its subdirectories.
for root, dirs, files in os.walk('python/Lib/email', topdown=False):
    # Loop through every non directory file in this directory and sum their sizes
    size = sum(getsize(join(root, name)) for name in files)

    # Look at all of the subdirectories and add up their sizes from the
    # `dirs_dict`.  os.walk lists symlinked directories (and directories it
    # failed to read) in `dirs` but never yields them as `root`, so they have
    # no entry; count them as 0 instead of raising KeyError.
    subdir_size = sum(dirs_dict.get(join(root, d), 0) for d in dirs)

    # store the size of this directory (plus subdirectories) in a dict so we
    # can access it later
    my_size = dirs_dict[root] = size + subdir_size

    # A parenthesised single-argument print works on Python 2 and Python 3.
    print('%s: %d' % (root, my_size))
实际上,如果目录中有符号链接,@mgilson 的回答将不起作用。要支持这种情况,必须这样做:
# Maps every visited path (files and directories) to its size in bytes;
# symbolic links are recorded with size 0.
# `directory` (the tree to measure) is not defined in this snippet and must be
# provided by the surrounding code.
dirs_dict = {}
for root, dirs, files in os.walk(directory, topdown=False):
    if os.path.islink(root):
        # Plain 0 instead of the Python 2 literal ``0L``, which is a
        # SyntaxError on Python 3 (ints promote automatically on Python 2).
        dirs_dict[root] = 0
    else:
        # Start from the size of the directory entry itself.
        dir_size = getsize(root)

        # Loop through every non directory file in this directory and sum their sizes
        for name in files:
            full_name = join(root, name)
            if os.path.islink(full_name):
                nsize = 0
            else:
                nsize = getsize(full_name)
            dirs_dict[full_name] = nsize
            dir_size += nsize

        # Look at all of the subdirectories and add up their sizes from the `dirs_dict`
        subdir_size = 0
        for d in dirs:
            full_d = join(root, d)
            if os.path.islink(full_d):
                dirs_dict[full_d] = 0
            else:
                # A sub-directory that os.walk could not list is never
                # yielded, so it has no entry; treat it as empty rather than
                # failing with KeyError.
                subdir_size += dirs_dict.get(full_d, 0)

        dirs_dict[root] = dir_size + subdir_size
以下脚本打印指定目录的所有子目录的目录大小。这个脚本应该独立于平台——Posix/Windows/etc。它还试图从缓存递归函数的调用中获益(如果可能)。如果省略参数,脚本将在当前目录中运行。输出按目录大小从大到小排序。所以你可以根据你的需要调整它。
PS:我使用了 ActiveState 代码配方(recipe)578019,以便用易读的格式显示目录大小。
from __future__ import print_function
import os
import sys
import operator
def null_decorator(ob):
    """Decorator that does nothing: return ``ob`` exactly as it was received."""
    return ob
# Pick the decorator applied to get_dir_size: functools.lru_cache where the
# interpreter provides it (Python 3.2+), otherwise the do-nothing fallback.
if sys.version_info < (3, 2, 0):
    my_cache_decorator = null_decorator
else:
    import functools
    my_cache_decorator = functools.lru_cache(maxsize=4096)

# Directory to scan: the first command-line argument made absolute and
# normalised, or the current directory when no argument was given.
if len(sys.argv) > 1:
    start_dir = os.path.normpath(os.path.abspath(sys.argv[1]))
else:
    start_dir = '.'
@my_cache_decorator
def get_dir_size(start_path = '.'):
    """Return the combined size in bytes of the files below ``start_path``.

    Sub-directories are measured by recursive calls, so whatever caching
    ``my_cache_decorator`` provides applies to every directory visited.
    Symbolic links are neither followed nor counted.

    :param start_path: directory to measure; defaults to the current directory.
    :returns: total size in bytes as an int.
    """
    size_in_bytes = 0

    if hasattr(os, 'scandir'):
        # Fast path: os.scandir (new in Python 3.5).
        for item in os.scandir(start_path):
            if item.is_dir(follow_symlinks=False):
                size_in_bytes += get_dir_size(item.path)
            elif item.is_file(follow_symlinks=False):
                size_in_bytes += item.stat().st_size
        return size_in_bytes

    # Slow but compatible path: os.listdir plus explicit os.path checks.
    for name in os.listdir(start_path):
        candidate = os.path.abspath(os.path.join(start_path, name))
        if os.path.islink(candidate):
            continue
        if os.path.isdir(candidate):
            size_in_bytes += get_dir_size(candidate)
        elif os.path.isfile(candidate):
            size_in_bytes += os.path.getsize(candidate)
    return size_in_bytes
def get_dir_size_walk(start_path = '.'):
    """Return the total size in bytes of the regular files below ``start_path``.

    The tree is traversed with ``os.walk``.  Symbolic links are not counted,
    which matches ``get_dir_size`` and keeps a dangling link from aborting the
    whole scan with ``OSError`` when ``os.path.getsize`` tries to follow it.

    :param start_path: directory to measure; defaults to the current directory.
    :returns: sum of the sizes of all non-symlink files found, as an int.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(start_path):
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            if os.path.islink(file_path):
                # Links (including broken ones) contribute nothing.
                continue
            total_size += os.path.getsize(file_path)
    return total_size
def bytes2human(n, format='%(value).0f%(symbol)s', symbols='customary'):
    """
    (c) http://code.activestate.com/recipes/578019/

    Convert n bytes into a human readable string based on format.

    symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
    see: https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000

    :param n: number of bytes; truncated with ``int()`` before conversion.
    :param format: %-style template that may use the ``value`` and ``symbol`` keys.
    :param symbols: name of the unit table to use.
    :returns: the formatted string.
    :raises ValueError: if ``n`` is negative.
    :raises KeyError: if ``symbols`` is not one of the four table names.

      >>> bytes2human(0)
      '0B'
      >>> bytes2human(0.9)
      '0B'
      >>> bytes2human(1)
      '1B'
      >>> bytes2human(1.9)
      '1B'
      >>> bytes2human(1024)
      '1K'
      >>> bytes2human(1048576)
      '1M'
      >>> bytes2human(1099511627776127398123789121, format="%(value).1f %(symbol)s")
      '909.5 Y'

      >>> bytes2human(9856, format="%(value).1f %(symbol)s", symbols="customary")
      '9.6 K'
      >>> bytes2human(9856, format="%(value).1f %(symbol)s", symbols="customary_ext")
      '9.6 kilo'
      >>> bytes2human(9856, format="%(value).1f %(symbol)s", symbols="iec")
      '9.6 Ki'
      >>> bytes2human(9856, format="%(value).1f %(symbol)s", symbols="iec_ext")
      '9.6 kibi'

      >>> bytes2human(10000, "%(value).1f %(symbol)s/sec")
      '9.8 K/sec'

      >>> # precision can be adjusted by playing with %f operator
      >>> bytes2human(10000, format="%(value).5f %(symbol)s")
      '9.76562 K'
    """
    SYMBOLS = {
        'customary'     : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
        'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                           'zetta', 'yotta'),
        'iec'           : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
        'iec_ext'       : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                           'zebi', 'yobi'),
    }
    n = int(n)
    if n < 0:
        raise ValueError("n < 0")
    units = SYMBOLS[symbols]
    # Try the largest unit first; the unit at index ``power`` is worth
    # 2 ** (10 * power) bytes.
    for power in range(len(units) - 1, 0, -1):
        threshold = 1 << (power * 10)
        if n >= threshold:
            # Pass exactly the documented keys rather than ``locals()``.
            return format % dict(symbol=units[power], value=float(n) / threshold)
    # Smaller than the first multiple: report in the base unit.
    return format % dict(symbol=units[0], value=n)
############################################################
###
### main ()
###
############################################################
if __name__ == '__main__':
    # Size function to use.  The recursive get_dir_size can benefit from
    # cached calls (functools.lru_cache); get_dir_size_walk is the slower
    # os.walk-based alternative that may be assigned here instead.
    get_size = get_dir_size

    # Map every sub-directory found below start_dir to its size in bytes.
    dir_tree = {}
    for parent, subdirs, _files in os.walk(start_dir):
        for name in subdirs:
            candidate = os.path.join(parent, name)
            if os.path.isdir(candidate):
                dir_tree[candidate] = get_size(candidate)

    # Largest directories first.
    ranked = sorted(dir_tree.items(), key=operator.itemgetter(1), reverse=True)
    for path, nbytes in ranked:
        print('%s\t%s' %(bytes2human(nbytes, format='%(value).2f%(symbol)s'), path))

    print('-' * 80)
    # cache_info() only exists when the lru_cache decorator was applied.
    if sys.version_info >= (3,2,0):
        print(get_dir_size.cache_info())
样本输出:
37.61M .\subdir_b
2.18M .\subdir_a
2.17M .\subdir_a\subdir_a_2
4.41K .\subdir_a\subdir_a_1
----------------------------------------------------------
CacheInfo(hits=2, misses=4, maxsize=4096, currsize=4)
我用这段代码实现了这个:
def get_dir_size(path=None):
    """Print the size of each directory under ``path`` and the grand total.

    For every directory visited by ``os.walk`` one line is printed with the
    combined size in bytes of the files directly inside it, followed by a
    final line with the overall total in kilobytes.

    :param path: root directory to scan.  ``None`` (the default) means the
        current working directory *at call time*; a default of
        ``os.getcwd()`` in the signature would be evaluated only once, when
        the function is defined.
    :returns: the total size in bytes of all files found.
    """
    if path is None:
        path = os.getcwd()
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        dirsize = sum(os.path.getsize(os.path.join(dirpath, f)) for f in filenames)
        total_size += dirsize
        print('\t', dirsize, dirpath)
    print(" {0:.2f} Kb".format(total_size / 1024))
    return total_size
我使用 pathlib 模块实现了这一点。以下代码会计算给定目录树中每个子目录的正确大小。
注意:如果您只想计算给定根目录的总大小,而不是用这段代码逐个计算每个子目录的大小,那么需要去掉外层循环,即 for sub in subdir:,并将 ls = list(sub.rglob('*.*')) 替换为 ls = list(dir_path.rglob('*.*')),然后相应地调整缩进。
下面是在 Windows 上使用 Python 3.7.6 编写的示例代码。
import os
from pathlib import Path
# Set home/root path
dir_path = Path('//?/C:/Downloads/.../.../.../.../...')

# IMP_NOTE: If the path is 265 characters long, which exceeds the classic MAX_PATH - 1 (259) character
#           limit for DOS paths. Use an extended (verbatim) path such as "\\\\?\\C:\\" in order
#           to access the full length that's supported by the filesystem -- about 32,760 characters.
#           Alternatively, use Windows 10 with Python 3.6+ and enable long DOS paths in the registry.

# pathlib normalizes Windows paths to use backslash, so we can use
# Path('//?/D:/') without having to worry about escaping backslashes.

# Generate a complete list of sub-directories
subdir = [x for x in dir_path.rglob('*') if x.is_dir()]

for sub in subdir:
    # Match '*' and keep only regular files: a '*.*' pattern would only match
    # names containing a dot, skipping extension-less files and counting
    # directories whose names contain a dot.
    ls = [p for p in sub.rglob('*') if p.is_file()]
    tot_dir_size = sum(os.path.getsize(k) for k in ls)

    # path.parts ==> Provides a tuple giving access to the path's various components
    # (Ref.: pathlib documentation)
    print("For Sub-directory: " + sub.parts[-1] + " ===> " +
          "Size = " + str(format(tot_dir_size, ',d')) + "\n")
输出:
For Sub-directory: DIR_1 ===> Size = 5,600,621,618
For Sub-directory: DIR_2 ===> Size = 9,113,492,347
For Sub-directory: DIR_3 ===> Size = 928,986,489
For Sub-directory: DIR_4 ===> Size = 2,125,250,470
使用 recursive-size 包:
pip install recursive-size
然后这样使用(摘自其官方文档):
# `recursive_size` is a third-party package (see the `pip install
# recursive-size` step); it is not part of the standard library.
from recursive_size import get_size
# NOTE(review): presumably the total size of everything under the folder --
# confirm the unit and symlink handling against the package documentation.
size = get_size('path/to/folder')
print(size)