改善Python脚本删除文件和目录的运行时间

问题描述 投票:0回答:2

我有一个 Python 脚本,用于删除 X 天之前的文件和目录。然而,该脚本运行在一个包含数百万个文件和目录的巨大目录上。按照目前的速度(根据磁盘空间指标估算),完成整个删除过程大约需要六周时间。

看来主要瓶颈在于列出文件和目录。有人可以建议代码更改或优化以帮助减少运行时间吗?

不确定这是否相关,但补充一下背景:该脚本在 k8s 中作为 Job 运行,因此资源不是问题。

def delete_files(root_directory, delete_time_threshold):
    """Delete every file under ``root_directory`` older than the threshold.

    The tree is traversed with ``os.walk``; a file is removed when the value
    returned by ``os.path.getmtime`` is strictly less than
    ``delete_time_threshold``. Directories are never removed here.

    Side effects: increments the module-level counters ``EXAMINED_FILES``
    (once per file whose mtime could be read) and ``DELETED_FILES`` (once per
    file removed).

    Args:
        root_directory: Path of the directory tree to clean up.
        delete_time_threshold: Cut-off compared against each file's mtime, so
            it must be on the same scale as ``os.path.getmtime`` results.

    Exceptions are logged and never propagated, so one unreadable or
    undeletable file does not abort the rest of the run.
    """
    global EXAMINED_FILES
    global DELETED_FILES
    try:
        for dirpath, _dirnames, filenames in os.walk(root_directory):
            for file in filenames:
                file_path = os.path.join(dirpath, file)
                try:
                    file_mtime = os.path.getmtime(file_path)
                    EXAMINED_FILES += 1
                    if file_mtime < delete_time_threshold:
                        os.remove(file_path)
                        logging.debug(f"File {file} deleted because mtime {file_mtime} is older than threshold {delete_time_threshold}")
                        DELETED_FILES += 1
                except Exception as e:
                    # The failure can come from the stat as well as from the
                    # remove, so the log must carry the actual reason.
                    logging.error(f"Error deleting file {file_path}: {e}")
    except Exception as e:
        logging.error(f"Error walking root directory {root_directory}: {e}")

def delete_empty_directories(root_directory, allowed_empty_dirs):
    """Remove empty directories below ``root_directory``.

    The tree is traversed top-down with ``os.walk``. A directory is treated as
    empty when the listing ``os.walk`` produced for it contains neither
    sub-directories nor files, so a directory whose only content is empty
    sub-directories is left in place by this pass. ``root_directory`` itself
    is never examined or removed.

    Side effects: increments the module-level counters ``EXAMINED_DIRS``
    (every visited directory except the root), ``DELETED_DIRS`` (every
    directory removed) and ``SKIPPED_DELETE_DIRS`` (every empty directory kept
    because it is allowed).

    Args:
        root_directory: Path of the directory tree to clean up.
        allowed_empty_dirs: Container of directory paths, relative to
            ``root_directory``, that must be kept even when empty.

    Exceptions are logged and never propagated.
    """
    global EXAMINED_DIRS
    global DELETED_DIRS
    global SKIPPED_DELETE_DIRS
    try:
        for dirpath, dirnames, filenames in os.walk(root_directory):
            if dirpath != root_directory:  # don't look at the root directory
                EXAMINED_DIRS += 1
                try:
                    if not dirnames and not filenames:
                        # relpath instead of a regex built from the root path:
                        # regex metacharacters or a trailing slash in the root
                        # would leave the prefix in place, and an allowed
                        # directory would then be deleted.
                        relative_dirpath = os.path.relpath(dirpath, root_directory)
                        if relative_dirpath and relative_dirpath in allowed_empty_dirs:
                            logging.debug(f"Skipping deletion of allowed empty directory: {dirpath}")
                            SKIPPED_DELETE_DIRS += 1
                        else:
                            os.rmdir(dirpath)
                            logging.debug(f"Deleted empty directory: {dirpath}")
                            DELETED_DIRS += 1
                except Exception as e:
                    logging.error(f"Error deleting directory {dirpath}: {e}")
    except Exception as e:
        logging.error(f"Error walking root directory {root_directory}: {e}")


谢谢!

python python-3.x performance runtime
2个回答
0
投票

您可以使用多进程(multiprocessing)并行处理文件,这样可以加快处理速度。

import os
import logging
import re
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

# Run statistics, all starting at zero. Functions in this module update them
# through ``global`` statements and the ``__main__`` section prints them.
EXAMINED_FILES = DELETED_FILES = 0
EXAMINED_DIRS = DELETED_DIRS = SKIPPED_DELETE_DIRS = 0


def _examine_file(file_path, delete_time_threshold):
    """Check one file's mtime and delete the file when it is too old.

    Touches no module-level state, so it gives the same result whether it is
    called directly or inside a worker process.

    Args:
        file_path: Path of the file to check.
        delete_time_threshold: Cut-off compared against
            ``os.path.getmtime(file_path)``.

    Returns:
        ``(examined, deleted)``, each 0 or 1: whether the mtime could be read
        and whether the file was removed. Exceptions are logged, not raised.
    """
    examined = deleted = 0
    try:
        file_mtime = os.path.getmtime(file_path)
        examined = 1
        if file_mtime < delete_time_threshold:
            os.remove(file_path)
            logging.debug(f"File {file_path} deleted because mtime {file_mtime} is older than threshold {delete_time_threshold}")
            deleted = 1
    except Exception as e:
        logging.error(f"Error processing file {file_path}: {e}")
    return examined, deleted


def process_file(file_path, delete_time_threshold):
    """Delete ``file_path`` if it is older than the threshold and count it.

    Adds the outcome to the module-level counters ``EXAMINED_FILES`` and
    ``DELETED_FILES`` of the calling process. Errors on the file itself are
    logged and never propagated.

    Args:
        file_path: Path of the file to check.
        delete_time_threshold: Cut-off compared against the file's mtime.
    """
    global EXAMINED_FILES
    global DELETED_FILES
    examined, deleted = _examine_file(file_path, delete_time_threshold)
    EXAMINED_FILES += examined
    DELETED_FILES += deleted


def delete_files(root_directory, delete_time_threshold):
    """Delete old files under ``root_directory`` using a pool of processes.

    The parent process walks the tree with ``os.walk`` and hands the files of
    each directory to a ``ProcessPoolExecutor``. Workers report what they did
    as return values and the parent adds them to ``EXAMINED_FILES`` and
    ``DELETED_FILES``; a counter incremented inside a worker would only change
    that worker's own copy of the module globals.

    Args:
        root_directory: Path of the directory tree to clean up.
        delete_time_threshold: Cut-off compared against each file's mtime.
    """
    global EXAMINED_FILES
    global DELETED_FILES
    worker_count = multiprocessing.cpu_count()
    # Number of directories that may be submitted but not yet tallied. The
    # window lets the walk run ahead of the workers while keeping the number
    # of outstanding result objects bounded on trees with millions of files.
    max_in_flight = 4 * worker_count
    in_flight = []
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        # os.walk, not os.scandir: scandir yields DirEntry objects, which
        # cannot be unpacked into (dirpath, dirnames, filenames).
        for dirpath, _dirnames, filenames in os.walk(root_directory):
            if filenames:
                file_paths = [os.path.join(dirpath, name) for name in filenames]
                in_flight.append(
                    executor.map(
                        _examine_file,
                        file_paths,
                        [delete_time_threshold] * len(file_paths),
                    )
                )
            while len(in_flight) > max_in_flight:
                # Window full: wait for the oldest directory to finish.
                for examined, deleted in in_flight.pop(0):
                    EXAMINED_FILES += examined
                    DELETED_FILES += deleted
        for outcomes in in_flight:
            for examined, deleted in outcomes:
                EXAMINED_FILES += examined
                DELETED_FILES += deleted


def delete_empty_directories(root_directory, allowed_empty_dirs):
    """Remove empty directories below ``root_directory``, deepest first.

    The tree is traversed bottom-up (``os.walk(..., topdown=False)``) and each
    directory is re-listed with ``os.listdir`` just before the decision, so a
    directory that became empty because its sub-directories were removed
    earlier in the same pass is removed too. ``root_directory`` itself is
    never examined or removed.

    Side effects: increments the module-level counters ``EXAMINED_DIRS``
    (every visited directory except the root), ``DELETED_DIRS`` (every
    directory removed) and ``SKIPPED_DELETE_DIRS`` (every empty directory kept
    because it is allowed).

    Args:
        root_directory: Path of the directory tree to clean up.
        allowed_empty_dirs: Container of directory paths, relative to
            ``root_directory``, that must be kept even when empty.

    Exceptions are logged and never propagated.
    """
    global EXAMINED_DIRS
    global DELETED_DIRS
    global SKIPPED_DELETE_DIRS
    try:
        for dirpath, _dirnames, _filenames in os.walk(root_directory, topdown=False):
            if dirpath != root_directory:
                EXAMINED_DIRS += 1
                try:
                    if not os.listdir(dirpath):
                        # relpath instead of a regex built from the root path:
                        # regex metacharacters or a trailing slash in the root
                        # would leave the prefix in place, and an allowed
                        # directory would then be deleted.
                        relative_dirpath = os.path.relpath(dirpath, root_directory)
                        if relative_dirpath and relative_dirpath in allowed_empty_dirs:
                            logging.debug(f"Skipping deletion of allowed empty directory: {dirpath}")
                            SKIPPED_DELETE_DIRS += 1
                        else:
                            os.rmdir(dirpath)
                            logging.debug(f"Deleted empty directory: {dirpath}")
                            DELETED_DIRS += 1
                except Exception as e:
                    logging.error(f"Error deleting directory {dirpath}: {e}")
    except Exception as e:
        logging.error(f"Error walking root directory {root_directory}: {e}")


if __name__ == "__main__":
    # Placeholder configuration: edit these three values before running.
    root_directory = "/path/to/your/directory"
    delete_time_threshold = 1234567890  # Replace with your actual threshold
    allowed_empty_dirs = set()  # Add your allowed empty directories here

    # Files go first so that directories emptied by the file pass can be
    # removed by the directory pass.
    delete_files(root_directory, delete_time_threshold)
    delete_empty_directories(root_directory, allowed_empty_dirs)

    # Build the whole report, then emit it with a single print call.
    summary_lines = [
        f"Examined files: {EXAMINED_FILES}",
        f"Deleted files: {DELETED_FILES}",
        f"Examined directories: {EXAMINED_DIRS}",
        f"Deleted directories: {DELETED_DIRS}",
        f"Skipped allowed empty directories: {SKIPPED_DELETE_DIRS}",
    ]
    print("\n".join(summary_lines))

0
投票

为什么不直接使用 CLI 命令呢?这些命令都经过了很好的优化。

© www.soinside.com 2019 - 2024. All rights reserved.