我有一段 Python 代码,它按字节大小将文件分割成较小的块,例如将 1GB 的 filename.txt 分割成 10 个 100MB 的小文件:filename_001.txt、filename_002.txt 等。
虽然我对按文件大小分割的结果感到满意,但我注意到在这些较小的分割文件中,内容可能在一行中间的任意位置被截断。我希望保持每一行完整,即在不超过指定字节大小的前提下,只在行尾处进行分割。
请参阅我的以下代码:
import os
import sys
def getfilesize(sfilename):
    """Return the size of a file in bytes.

    The size is also printed to stdout as a progress message.

    Args:
        sfilename: Path of the file to measure.

    Returns:
        The file size in bytes.

    Raises:
        OSError: If the file does not exist or cannot be accessed.
    """
    # Ask the file system for the size instead of opening the file and
    # seeking to its end; this also avoids calling tell() twice.
    size = os.path.getsize(sfilename)
    print("getfilesize: size: %s" % size)
    return size
def splitfile(spath, sfilename, splitsize, dpath, keep_lines=False):
    """Split a file into numbered pieces of at most ``splitsize`` bytes.

    The pieces are named ``<dpath><stem>_<n><ext>``, where ``<stem>`` and
    ``<ext>`` are taken from ``sfilename`` and ``n`` counts up from 1. An
    existing piece with the same name is overwritten. No empty pieces are
    created.

    Args:
        spath: Prefix of the source file. It is concatenated with
            ``sfilename`` as-is, so it must end with a path separator.
        sfilename: Name of the file to split.
        splitsize: Maximum size of each piece, in bytes.
        dpath: Prefix of the output pieces; concatenated as-is as well.
        keep_lines: When false (the default), pieces are cut at exactly
            ``splitsize`` bytes, possibly in the middle of a line. When
            true, a piece only ever ends at the end of a line: a new piece
            is started as soon as the next line would not fit. A single
            line longer than ``splitsize`` is kept intact and written to a
            piece of its own, which then exceeds ``splitsize``.

    Returns:
        None. If the source file does not exist a message is printed and
        nothing is written.

    Raises:
        ValueError: If ``splitsize`` is not a positive number.
    """
    fullsfilepath = spath + sfilename
    if not os.path.isfile(fullsfilepath):
        # Format the complete path: "%" binds tighter than "+", so the
        # concatenation has to happen before the formatting.
        print("No such file as: \"%s\"" % fullsfilepath)
        return

    splitsize = int(splitsize)
    if splitsize <= 0:
        raise ValueError("splitsize must be a positive number of bytes")

    # splitext only looks at the last dot of the file name, so dots in
    # directory names and files without an extension are handled correctly.
    stem, ext = os.path.splitext(sfilename)
    readlimit = 5000  # in byte mode, read at most 5 kB at a time

    counter = 0   # number of the piece currently being written
    written = 0   # bytes already written to the current piece
    fw = None
    with open(fullsfilepath, "rb") as fr:
        try:
            while True:
                if keep_lines:
                    data = fr.readline()
                    # Roll over when the line does not fit any more. A
                    # fresh piece always accepts its first line, even an
                    # oversized one, so no line is ever cut.
                    piece_full = written > 0 and written + len(data) > splitsize
                else:
                    piece_full = written >= splitsize
                    room = splitsize if piece_full else splitsize - written
                    # Never read past the end of the current piece.
                    data = fr.read(min(readlimit, room))
                if not data:
                    break  # end of the source file
                if fw is None or piece_full:
                    if fw is not None:
                        fw.close()
                    counter += 1
                    # "wb" truncates a piece left over from an earlier run.
                    fw = open("%s%s_%d%s" % (dpath, stem, counter, ext), "wb")
                    written = 0
                fw.write(data)
                written += len(data)
        finally:
            if fw is not None:
                fw.close()
if __name__ == "__main__":
    # Expected call: filesplit.py spath sfilename splitsizeinkb dpath
    # Four user arguments plus the script name are needed, because
    # sys.argv[4] is read below; so sys.argv must hold five entries.
    if len(sys.argv) < 5:
        print("Missing argument: Usage: filesplit.py spath sfilename splitsizeinkb dpath")
    else:
        spath = sys.argv[1]
        sfilename = sys.argv[2]
        # The size is given in kB on the command line; convert it to bytes.
        filesize = int(sys.argv[3]) * 1000
        dpath = sys.argv[4]
        splitfile(spath, sfilename, filesize, dpath)
有可能实现这个目标吗?如果可以,应该怎么做?
也许您应该先获取最长行的大小,这样就能知道块的大小至少应该设置为多少。以下是可能有帮助的代码:
import os
import sys
from pathlib import Path
def lines_max_size(filepath: str) -> int:
    """Return the size in bytes of the longest line in a file.

    The file is read in binary mode, so each line is measured exactly as it
    is stored on disk, including its line terminator.

    Args:
        filepath (str): path of the file to scan

    Returns:
        The byte length of the longest line, or 0 for an empty file.
    """
    # len() of the raw bytes is the size the line occupies in the file;
    # sys.getsizeof() would report the in-memory size of the Python object
    # instead, which includes interpreter overhead.
    with open(filepath, "rb") as f:
        return max((len(line) for line in f), default=0)
def split_file(filepath: str, chunk_size: int, outfile_prefix):
    """Split a large file into smaller files without cutting any line.

    Whole lines are collected until the next line would push the chunk over
    ``chunk_size`` bytes; the chunk is then written next to the input file
    as ``outfile_prefix + number + extension``, numbered from 1. The file
    is processed in binary mode, so sizes are real on-disk byte counts and
    line endings are preserved unchanged.

    Args:
        filepath (str): path of the file to split
        chunk_size (int): maximum size of each output file, in bytes
        outfile_prefix (str): prefix of the output file names

    Raises:
        ValueError: If a single line is larger than ``chunk_size``. Chunks
            completed before that line was reached have already been
            written.
    """
    source = Path(filepath)
    save_path = source.parent
    # Path.suffix keeps the leading dot ("" when there is no extension), so
    # the pieces are named e.g. "output1.txt" rather than "output1txt".
    suffix = source.suffix

    def write_chunk(number, lines):
        target = save_path.joinpath(outfile_prefix + str(number) + suffix)
        target.write_bytes(b"".join(lines))

    count = 1
    pending = []      # lines of the chunk being assembled
    pending_size = 0  # total byte length of the lines in `pending`
    with open(filepath, "rb") as f:
        for line in f:
            line_size = len(line)
            if line_size > chunk_size:
                raise ValueError("current line size is %s, chunk_size - %s too small to splite file by lines" % (
                    line_size,
                    chunk_size
                ))
            if pending and pending_size + line_size > chunk_size:
                # The line does not fit: flush the chunk and start a new one.
                write_chunk(count, pending)
                count += 1
                pending = []
                pending_size = 0
            pending.append(line)
            pending_size += line_size
    if pending:
        write_chunk(count, pending)
def list_files_content_size(path: Path, file_prefix: str):
    """Print the size in bytes of every file in a directory with a prefix.

    One line is printed per matching file, in sorted name order, in the
    form ``<name> <size>bytes``. Directory entries that are not regular
    files are skipped.

    Args:
        path (Path): directory whose files are listed
        file_prefix (str): only files whose name starts with this are listed
    """
    folder = Path(path)
    for fn in sorted(os.listdir(folder)):
        if not fn.startswith(file_prefix):
            continue
        fp = folder.joinpath(fn)
        if not fp.is_file():
            continue
        # Take the size from the file system: no need to read the content,
        # and sys.getsizeof() would measure the Python object, not the file.
        size = fp.stat().st_size
        print(fn, str(size) + "bytes")
if __name__ == "__main__":
    filepath = "a.txt"
    # First, report the size of the longest line so that a workable chunk
    # size can be chosen.
    print("max size of lines is: ", lines_max_size(filepath))
    # Then split the large file.
    split_file(filepath=filepath, chunk_size=240, outfile_prefix="output")
    # Finally list the sizes of the pieces. split_file writes them next to
    # the input file, so look there rather than next to this script.
    list_files_content_size(Path(filepath).parent, "output")