I have the following Python script. Currently it generates several gz files of about 4 MB each in the S3 bucket; that is the default behavior of the script AWS Glue generated. Now I want it to create multiple files of a specific size, around 100-250 MB each, in the S3 bucket. I tried the following logic in the Python script, but it doesn't work and still creates several gz files of about 4 MB each.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import datetime
args = getResolvedOptions(sys.argv, ['target_BucketName', 'JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
outputbucketname = args['target_BucketName']
timestamp = datetime.datetime.now().strftime("%Y%m%d")
filename = f"tbd{timestamp}"
output_path = f"{outputbucketname}/{filename}"
# Script generated for node AWS Glue Data Catalog
AWSGlueDataCatalog_node075257312 = glueContext.create_dynamic_frame.from_catalog(
    database="ardt",
    table_name="_ard_tbd",
    transformation_ctx="AWSGlueDataCatalog_node075257312",
)
# Script generated for node Amazon S3
AmazonS3_node075284688 = glueContext.write_dynamic_frame.from_options(
    frame=AWSGlueDataCatalog_node075257312,
    connection_type="s3",
    format="csv",
    format_options={"separator": "|"},
    connection_options={
        "path": output_path,
        "compression": "gzip",
        "recurse": True,
        "groupFiles": "inPartition",
        "groupSize": "100000000",
    },
    transformation_ctx="AmazonS3_node075284688",
)
job.commit()
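One thing worth noting: in Glue, groupFiles and groupSize are read-side options that control how small input files are combined when reading from S3, so they have no effect on the size of the files the write produces. Below is a minimal sketch of the usual Spark-side approach, which would replace the write step above: repartition the underlying DataFrame before writing, since each partition becomes one output file. The partition count here (num_output_files = 8) is an illustrative assumption; in practice you would derive it from the table's total size divided by the target file size.

from awsglue.dynamicframe import DynamicFrame

# Assumption: derive this from your data, e.g. total gzipped size / ~150 MB.
num_output_files = 8

# Each Spark partition becomes one output part file, so fixing the partition
# count fixes the file count (and therefore the approximate file size).
repartitioned = DynamicFrame.fromDF(
    AWSGlueDataCatalog_node075257312.toDF().repartition(num_output_files),
    glueContext,
    "repartitioned",
)

glueContext.write_dynamic_frame.from_options(
    frame=repartitioned,
    connection_type="s3",
    format="csv",
    format_options={"separator": "|"},
    connection_options={"path": output_path, "compression": "gzip"},
    transformation_ctx="AmazonS3_write_sketch",  # illustrative name
)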
Alternatively, using pandas for this:
# Convert DynamicFrame to PySpark DataFrame
df_spark = AWSGlueDataCatalog_node075257312.toDF()
# Convert PySpark DataFrame to Pandas DataFrame
df_pandas = df_spark.toPandas()
# Determine the number of rows per file based on your desired file size
rows_per_file = 100000 # Adjust this number based on your data and desired file size
# Split the DataFrame into smaller DataFrames and save each to S3 in gzip format
for i in range(0, len(df_pandas), rows_per_file):
    df_chunk = df_pandas[i:i + rows_per_file]
    df_chunk.to_csv(
        f's3://your-bucket/your-output-path/part_{i // rows_per_file}.csv.gz',
        index=False,
        sep='|',  # match the "|" separator used by the Glue writer above
        compression='gzip'
    )
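Two caveats with this approach: toPandas() collects the whole table onto the driver, and a fixed rows_per_file targets a row count rather than a byte size. One rough way to aim at a byte target instead is to gzip a sample chunk of the CSV output and derive rows_per_file from the measured bytes per row. A sketch under stated assumptions: the 150 MB target and sample size are illustrative, and writing to an s3:// path with to_csv requires s3fs to be available in the job environment.

import gzip

TARGET_BYTES = 150 * 1024 * 1024  # assumption: aim for ~150 MB per gz file
SAMPLE_ROWS = 10_000              # assumption: sample size for the estimate

# Estimate the gzipped bytes per row from a sample of the CSV output.
sample = df_pandas.head(SAMPLE_ROWS)
sample_csv = sample.to_csv(index=False, sep='|')
bytes_per_row = len(gzip.compress(sample_csv.encode('utf-8'))) / max(1, len(sample))

# Derive the chunk size that lands near the byte target.
rows_per_file = max(1, int(TARGET_BYTES / bytes_per_row))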