我有一个 Python 脚本,可以删除 X 天之前的文件和目录。然而,该脚本运行在一个包含数百万个文件和目录的巨大目录上。按照目前的速度,完成删除过程大约需要六周时间(查看磁盘空间指标)。
看来主要瓶颈在于列出文件和目录。有人可以建议代码更改或优化以帮助减少运行时间吗?
不确定它是否相关,但就上下文而言,它在 k8s 中作为作业运行,因此资源不是问题。
def delete_files(root_directory, delete_time_threshold):
    """Walk root_directory and delete every regular file whose mtime is
    older than delete_time_threshold (epoch seconds).

    Updates the module-level EXAMINED_FILES / DELETED_FILES counters.
    Per-file errors are logged and skipped so one bad entry does not
    abort the whole sweep.
    """
    global EXAMINED_FILES
    global DELETED_FILES
    try:
        for dirpath, dirnames, filenames in os.walk(root_directory):
            for file in filenames:
                file_path = os.path.join(dirpath, file)
                try:
                    file_mtime = os.path.getmtime(file_path)
                    EXAMINED_FILES += 1
                    if file_mtime < delete_time_threshold:
                        os.remove(file_path)
                        logging.debug(f"File {file} deleted because mtime {file_mtime} is older than threshold {delete_time_threshold}")
                        DELETED_FILES += 1
                except Exception as e:
                    # Bug fix: the original message discarded the exception,
                    # making failures (permissions, races) undiagnosable.
                    logging.error(f"Error deleting file {file_path}: {e}")
    except Exception as e:
        logging.error(f"Error walking root directory {root_directory}: {e}")
def delete_empty_directories(root_directory, allowed_empty_dirs):
    """Walk root_directory and remove empty subdirectories, except those
    whose path relative to the root appears in allowed_empty_dirs.

    Updates the module-level EXAMINED_DIRS / DELETED_DIRS /
    SKIPPED_DELETE_DIRS counters. The root directory itself is never removed.
    """
    global EXAMINED_DIRS
    global DELETED_DIRS
    global SKIPPED_DELETE_DIRS
    try:
        for dirpath, dirnames, filenames in os.walk(root_directory):
            if dirpath != root_directory:  # never delete the root itself
                EXAMINED_DIRS += 1
                try:
                    if not dirnames and not filenames:
                        # Bug fix: the original built a regex from the raw root
                        # path (re.sub(f'^{root_directory}/', ...)), which breaks
                        # when the path contains metacharacters ('.', '+', '(').
                        # os.path.relpath does the same job safely.
                        relative_dirpath = os.path.relpath(dirpath, root_directory)
                        if relative_dirpath and relative_dirpath in allowed_empty_dirs:
                            logging.debug(f"Skipping deletion of allowed empty directory: {dirpath}")
                            SKIPPED_DELETE_DIRS += 1
                        else:
                            os.rmdir(dirpath)
                            logging.debug(f"Deleted empty directory: {dirpath}")
                            DELETED_DIRS += 1
                except Exception as e:
                    # Bug fix: include the exception in the log message.
                    logging.error(f"Error deleting directory {dirpath}: {e}")
    except Exception as e:
        logging.error(f"Error walking root directory {root_directory}: {e}")
谢谢!
您可以使用多进程(multiprocessing)并行处理文件,从而加快删除速度。
import os
import logging
import re
from concurrent.futures import ProcessPoolExecutor
import multiprocessing
# Module-level counters reported at the end of the run.
EXAMINED_FILES = 0       # files whose mtime was checked
DELETED_FILES = 0        # files actually removed
EXAMINED_DIRS = 0        # directories inspected for emptiness
DELETED_DIRS = 0         # empty directories removed
SKIPPED_DELETE_DIRS = 0  # empty directories kept because they are allow-listed
def process_file(file_path, delete_time_threshold):
    """Examine one file and delete it if its mtime is older than the threshold.

    Returns an (examined, deleted) tuple of 0/1 counts so that callers that
    fan this out to worker processes can aggregate totals. The module-level
    counters are still incremented for backward compatibility, but NOTE:
    when this runs inside a ProcessPoolExecutor worker, those globals live
    in the child process and are never reflected in the parent — only the
    return value can carry the counts back.
    """
    global EXAMINED_FILES
    global DELETED_FILES
    examined = 0
    deleted = 0
    try:
        file_mtime = os.path.getmtime(file_path)
        EXAMINED_FILES += 1
        examined = 1
        if file_mtime < delete_time_threshold:
            os.remove(file_path)
            logging.debug(f"File {file_path} deleted because mtime {file_mtime} is older than threshold {delete_time_threshold}")
            DELETED_FILES += 1
            deleted = 1
    except Exception as e:
        logging.error(f"Error processing file {file_path}: {e}")
    return (examined, deleted)
def delete_files(root_directory, delete_time_threshold):
    """Delete files older than delete_time_threshold under root_directory,
    fanning the per-file work out to a pool of worker processes.

    NOTE(review): counters incremented by process_file happen inside the
    worker processes, so the parent's global counters are not updated by
    this code path.
    """
    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        # Bug fix: the original iterated `os.scandir(root_directory)` as if it
        # yielded (dirpath, dirnames, filenames) triples. os.scandir yields
        # DirEntry objects and is not recursive, so the unpacking raised
        # immediately. os.walk provides the expected triples and recursion.
        for dirpath, dirnames, filenames in os.walk(root_directory):
            file_paths = [os.path.join(dirpath, name) for name in filenames]
            executor.map(process_file, file_paths, [delete_time_threshold] * len(file_paths))
def delete_empty_directories(root_directory, allowed_empty_dirs):
    """Remove empty subdirectories of root_directory (bottom-up), skipping
    any whose root-relative path is in allowed_empty_dirs.

    topdown=False yields children before parents, so a chain of nested empty
    directories is removed in a single pass. os.listdir() is re-checked
    because the dirnames/filenames that os.walk captured were listed before
    the children were deleted and may be stale.
    """
    global EXAMINED_DIRS
    global DELETED_DIRS
    global SKIPPED_DELETE_DIRS
    try:
        for dirpath, dirnames, filenames in os.walk(root_directory, topdown=False):
            if dirpath != root_directory:  # never delete the root itself
                EXAMINED_DIRS += 1
                try:
                    if not os.listdir(dirpath):
                        # Bug fix: the original built a regex from the raw root
                        # path; metacharacters in the path broke the match.
                        # os.path.relpath strips the prefix safely.
                        relative_dirpath = os.path.relpath(dirpath, root_directory)
                        if relative_dirpath and relative_dirpath in allowed_empty_dirs:
                            logging.debug(f"Skipping deletion of allowed empty directory: {dirpath}")
                            SKIPPED_DELETE_DIRS += 1
                        else:
                            os.rmdir(dirpath)
                            logging.debug(f"Deleted empty directory: {dirpath}")
                            DELETED_DIRS += 1
                except Exception as e:
                    logging.error(f"Error deleting directory {dirpath}: {e}")
    except Exception as e:
        logging.error(f"Error walking root directory {root_directory}: {e}")
if __name__ == "__main__":
    # The __main__ guard is required here: ProcessPoolExecutor re-imports
    # this module in worker processes on spawn-based platforms, and without
    # the guard each worker would re-run the deletion.
    root_directory = "/path/to/your/directory"
    delete_time_threshold = 1234567890 # Replace with your actual threshold (epoch seconds; files with older mtime are deleted)
    allowed_empty_dirs = set() # Add your allowed empty directories here (paths relative to root_directory)
    delete_files(root_directory, delete_time_threshold)
    delete_empty_directories(root_directory, allowed_empty_dirs)
    # NOTE(review): file counters are incremented inside the worker
    # processes, so they will read 0 here when deletion ran via the pool.
    print(f"Examined files: {EXAMINED_FILES}")
    print(f"Deleted files: {DELETED_FILES}")
    print(f"Examined directories: {EXAMINED_DIRS}")
    print(f"Deleted directories: {DELETED_DIRS}")
    print(f"Skipped allowed empty directories: {SKIPPED_DELETE_DIRS}")
另外,也可以考虑直接使用 CLI 命令(例如 `find /path -type f -mtime +X -delete`)?这些工具针对大规模文件遍历都经过了很好的优化。