AWS Glue Python script to create multiple files of a specific size is not working

Problem description · Votes: 0 · Answers: 1

I have the Python script below. Currently it generates several gz files of about 4MB each in the S3 bucket, which is what AWS Glue creates by default. Now I want to create multiple files of a specific size, around 100-250MB each, in the S3 bucket. I tried the following logic in the Python script, but it does not work and it still creates several gz files of about 4MB.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import datetime


args = getResolvedOptions(sys.argv, ['target_BucketName', 'JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

outputbucketname = args['target_BucketName']

timestamp = datetime.datetime.now().strftime("%Y%m%d")
filename = f"tbd{timestamp}"
output_path = f"{outputbucketname}/{filename}"


# Script generated for node AWS Glue Data Catalog
AWSGlueDataCatalog_node075257312 = glueContext.create_dynamic_frame.from_catalog(
    database="ardt",
    table_name="_ard_tbd",
    transformation_ctx="AWSGlueDataCatalog_node075257312",
)

# Script generated for node Amazon S3
AmazonS3_node075284688 = glueContext.write_dynamic_frame.from_options(
    frame=AWSGlueDataCatalog_node075257312,
    connection_type="s3",
    format="csv",
    format_options={"separator": "|"},
    connection_options={
        "path": output_path,
        "compression": "gzip",
        "recurse": True,
        "groupFiles": "inPartition",
        "groupSize": "100000000",
    },
    transformation_ctx="AmazonS3_node075284688",
)


job.commit()
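
For context (not part of the original post), a common Spark-side way to influence the number, and hence the approximate size, of the output files is to repartition or coalesce the data before writing, so that each partition becomes one output file. A minimal sketch under that assumption, reusing the frame and context defined above; num_output_files is a placeholder you would tune so each gzipped file lands in the 100-250MB range:

from awsglue.dynamicframe import DynamicFrame

# Coalesce to a fixed number of partitions; each partition is written out as one file.
num_output_files = 10  # assumed value: roughly total compressed size / target file size
coalesced_df = AWSGlueDataCatalog_node075257312.toDF().coalesce(num_output_files)
coalesced_dyf = DynamicFrame.fromDF(coalesced_df, glueContext, "coalesced_dyf")

glueContext.write_dynamic_frame.from_options(
    frame=coalesced_dyf,
    connection_type="s3",
    format="csv",
    format_options={"separator": "|"},
    connection_options={"path": output_path, "compression": "gzip"},
    transformation_ctx="AmazonS3_node075284688",
)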
python amazon-s3 aws-glue
1 Answer

0 votes

Use pandas for this:

# Convert DynamicFrame to PySpark DataFrame
df_spark = AWSGlueDataCatalog_node075257312.toDF()

# Convert PySpark DataFrame to Pandas DataFrame
df_pandas = df_spark.toPandas()

# Determine the number of rows per file based on your desired file size
rows_per_file = 100000  # Adjust this number based on your data and desired file size
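
# (Sketch with assumed values, not in the original answer) One way to derive
# rows_per_file from a target compressed size: gzip-compress a small sample
# of the data and scale from the measured bytes per row.
import gzip

sample = df_pandas.head(10000)
sample_gz_bytes = len(gzip.compress(sample.to_csv(index=False).encode("utf-8")))
bytes_per_row = sample_gz_bytes / max(len(sample), 1)
target_bytes = 150 * 1024 * 1024  # assumed target of roughly 150 MB per gz file
rows_per_file = max(int(target_bytes / bytes_per_row), 1)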

# Split the DataFrame into smaller DataFrames and save each to S3 in gzip format
for i in range(0, len(df_pandas), rows_per_file):
    df_chunk = df_pandas[i:i + rows_per_file]
    df_chunk.to_csv(
        f's3://your-bucket/your-output-path/part_{i // rows_per_file}.csv.gz', 
        index=False, 
        compression='gzip'
    )
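
Note that toPandas() collects the entire dataset onto the Glue driver, so this approach only works when the data fits in driver memory, and writing to an s3:// path from pandas relies on fsspec/s3fs being available in the job environment. For larger datasets, repartitioning or coalescing the Spark DataFrame before writing it out keeps the work distributed.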