按字节大小将大文本文件分割成较小的块并保留行尾

问题描述 投票:0回答:1

我有Python代码,它将文件分割成字节大小的较小块,例如filename.txt被1gb分割成10个100mb的小文件 - filename_001.txt,filename_002.txt等。

但是,尽管我对文件大小分割感到满意,但我注意到在这些较小的分割文件中,文件内容可以在行上的任何位置剪切,但我想保留该行并在行尾以字节大小分割.

请参阅我的以下代码:

import os
import sys

def getfilesize(sfilename):
    with open(sfilename,"rb") as fr:
        fr.seek(0,2) # move to end of the file
        size=fr.tell()
        print("getfilesize: size: %s" % size)
        return fr.tell()

def splitfile(spath, sfilename, splitsize, dpath):
    # Open original file in read only mode
    if not os.path.isfile(spath + sfilename):
        print("No such file as: \"%s\"" % spath + sfilename)
        return

    filesize=getfilesize(spath + sfilename)

    fullsfilepath = spath + sfilename
    fulldfilepath = dpath + sfilename

    with open(spath + sfilename,"rb") as fr:
        counter=1
        orginalfilename = fullsfilepath.split(".")
        newfilename = fulldfilepath.split(".")
        print(orginalfilename)
        readlimit = 5000 #read 5kb at a time
        n_splits = filesize//splitsize
        print("splitfile: No of splits required: %s" % str(n_splits))
        for i in range(n_splits+1):
            chunks_count = int(splitsize)//int(readlimit)
            data_5kb = fr.read(readlimit) # read
            # Create split files
            print("chunks_count: %d" % chunks_count)

            with open(newfilename[0]+"_{id}.".format(id=str(counter))+newfilename[1],"ab") as fw:
                fw.seek(0)
                fw.truncate()# truncate original if present
                while data_5kb:
                    fw.write(data_5kb)
                    if chunks_count:
                        chunks_count-=1
                        data_5kb = fr.read(readlimit)
                    else: break
            counter+=1

if __name__ == "__main__":
    if len(sys.argv) < 4: print("Missing argument: Usage: filesplit.py sfilename splitsizeinkb dfilename")
    else:
        filesize = int(sys.argv[3]) * 1000 #make into kb
        spath = sys.argv[1]
        sfilename = sys.argv[2]
        dpath = sys.argv[4]
        splitfile(spath, sfilename, filesize, dpath)

有可能实现这个目标吗?如果是这样怎么办?

python
1个回答
0
投票

也许您应该首先获得最大行大小,然后您就会知道应该如何设置块的大小。以下是可能有帮助的代码:

import os
import sys
from pathlib import Path

def lines_max_size(filepath: str):
    """
    iter file content by size and return max fize of them

    Args:
        filepath (str): path of file to iter
    """
    ans = 0
    with open(filepath, "r") as f:
        for line in f:
            size = sys.getsizeof(line)
            if size > ans:
                ans = size
    return ans


def split_file(filepath: str, chunk_size: int, outfile_prefix):
    """
    split large file to smaller files by lines, whiches size will lower than chunk_size

    Args:
         filepath (str): path of file to split
         chunk_size (str): size the files after spliting should be lower than, unit - bytes
         outfile_prefix (str): name of output files should be in the format - outfile_prefix+count_number
    """
    content = ""
    count = 1
    save_path = Path(filepath).parent
    with open(filepath, "r") as f:
        for line in f:
            line_size = sys.getsizeof(line)
            if line_size > chunk_size:
                raise Exception("current line size is %s, chunk_size - %s too small to splite file by lines" % (
                    line_size,
                    chunk_size
                ))

            if sys.getsizeof(content+line) <= chunk_size:
                content += line
            else:
                fn = outfile_prefix + str(count) + filepath.split(".")[-1]
                fp = save_path.joinpath(fn)
                with open(fp, "w") as fw:
                    fw.write(content)
                content = line
                count += 1
    if content:
        fn = outfile_prefix + str(count) + filepath.split(".")[-1]
        fp = save_path.joinpath(fn)
        with open(fp, "w") as fw:
            fw.write(content)


def list_files_content_size(path: Path, file_prefix: str):
    """
    under path, list size of file's content, those files' name starts with file_prefix

    Args:
        path (Path): files under this path to list
        file_prefix (str): files name should start with 
"""
    for fn in os.listdir(path):
        if not fn.startswith(file_prefix):
            continue
        fp = path.joinpath(fn)
        with open(fp, "r") as f:
            size = sys.getsizeof(f.read())
            print(fn, str(size) + "bytes")


if __name__ == "__main__":
    filepath = "a.txt"
    #Firstly,get max fize of lines 
    print("max size of lines is: ", lines_max_size(filepath))

    # Then, split large file
    split_file(filepath="a.txt", chunk_size=240, outfile_prefix="output")

    # list out files's size
    current_path = Path(__file__).parent
    list_files_content_size(current_path, "output")
© www.soinside.com 2019 - 2024. All rights reserved.