我想在 Python 中使用 AWS 将大型 PDF 文件(800 页)处理为 PNG 文件。
您能给我推荐 Lambda 函数的代码吗?
我尝试将多页 PDF 转换为 PNG,但处理到约 50 页时出现运行时退出错误(Runtime.ExitError)。
import json
import boto3
import zipfile
from io import BytesIO
from PIL import Image
import logging
import os
from datetime import datetime
import fitz
import urllib.parse
# Configure module-level logging; Lambda containers reuse this setup
# across warm invocations.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize AWS clients at module scope so they are created once per
# container and shared by every invocation.
lambda_client = boto3.client("lambda")  # NOTE(review): not referenced in this file — confirm before removing
dynamodb = boto3.resource("dynamodb")
s3 = boto3.client("s3")
# Specify the DynamoDB table name
TABLE_NAME = "XYZ"  # placeholder — the document-status table used by update_document_status
# Function to update document status in DynamoDB
def update_document_status(batch_id, status, stage):
    """Record the current status/stage of a batch in the DynamoDB table.

    Sets ``status``, ``stage`` and ``updated_date`` on the item keyed by
    ``batch_id``. Terminal statuses ("completed" / "failed") additionally
    stamp ``end_time`` and ``update_time`` with the same timestamp.

    Args:
        batch_id: Partition key of the item to update.
        status: New status value (e.g. "inprogress", "completed", "failed").
        stage: Pipeline stage name (e.g. "Image Conversion").

    Returns:
        True when the update succeeds.

    Raises:
        Exception: Re-raises any DynamoDB error after logging it.
    """
    try:
        table = dynamodb.Table(TABLE_NAME)
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        names = {
            "#status": "status",
            "#stage": "stage",
            "#updated_date": "updated_date",
        }
        values = {
            ":status": status,
            ":stage": stage,
            ":updated_date": now,
        }
        expr = "SET #status = :status, #stage = :stage, #updated_date = :updated_date"

        # Terminal states also record when processing finished.
        if status in ("completed", "failed"):
            expr += ", #end_time = :end_time, #update_time = :update_time"
            names.update({"#end_time": "end_time", "#update_time": "update_time"})
            values.update({":end_time": now, ":update_time": now})

        response = table.update_item(
            Key={"batch_id": batch_id},
            UpdateExpression=expr,
            ExpressionAttributeNames=names,
            ExpressionAttributeValues=values,
            ReturnValues="UPDATED_NEW",
        )
        logger.info("UpdateItem succeeded: %s", response)
        return True
    except Exception as e:
        logger.error("Error updating document status: %s", e)
        raise
# Function to convert multi-page TIFF file to PNG
def convert_multi_page_tiff_to_png(tiff_file):
    """Convert a (possibly multi-page) TIFF into a list of PNG byte strings.

    Args:
        tiff_file: The TIFF image as raw ``bytes`` (e.g. the return value of
            ``ZipFile.read``) or anything ``PIL.Image.open`` accepts
            (path or file-like object).

    Returns:
        list[bytes]: One PNG-encoded image per TIFF page, in page order.

    Raises:
        Exception: Re-raises any PIL error after logging it.
    """
    try:
        # The caller (process_zip_file) passes raw bytes from ZipFile.read();
        # PIL.Image.open requires a path or file-like object, so wrap them.
        if isinstance(tiff_file, (bytes, bytearray)):
            tiff_file = BytesIO(tiff_file)
        png_images = []
        with Image.open(tiff_file) as img:
            # A single loop covers both single- and multi-page TIFFs;
            # getattr guards formats that do not expose n_frames at all.
            for frame in range(getattr(img, "n_frames", 1)):
                img.seek(frame)
                buf = BytesIO()
                img.save(buf, format="PNG")
                png_images.append(buf.getvalue())
        return png_images
    except Exception as e:
        logger.error(f"Error converting multi-page TIFF to PNG: {e}")
        raise
# Function to convert multi-page PDF file to PNG
def convert_multi_page_pdf_to_png(pdf_data):
    """Render every page of a PDF document to a PNG image.

    Args:
        pdf_data: The PDF document as raw bytes.

    Returns:
        list[bytes]: One PNG-encoded image per page, in page order.

    Raises:
        Exception: Re-raises any PyMuPDF error after logging it.

    Note:
        All rendered pages are kept in memory at once. For large documents
        (hundreds of pages) this dominates Lambda memory usage — raise the
        function's MemorySize, or stream each page to S3 and drop it,
        instead of accumulating the whole list.
    """
    png_images = []
    doc = None
    try:
        doc = fitz.open(stream=pdf_data, filetype="pdf")
        for page in doc:
            # Encode the pixmap to PNG directly; skipping the PIL round-trip
            # avoids holding a second full-size RGB copy of every page.
            png_images.append(page.get_pixmap().tobytes("png"))
    except Exception as e:
        logger.error("Error converting multi-page PDF to PNG: %s", e)
        raise
    finally:
        # Always release the document's native resources (was leaked before).
        if doc is not None:
            doc.close()
    return png_images
def process_zip_file(filepath):
    """Walk a ZIP archive and convert each contained TIFF/PDF to PNG images.

    Args:
        filepath: Path or file-like object of the ZIP archive, or the archive
            itself as raw ``bytes`` (e.g. ``s3_object['Body'].read()``).

    Unsupported file types inside the archive are skipped with a message.
    """
    # Raw bytes (as read from S3) are not seekable; zipfile needs a
    # file-like object, so wrap them first.
    if isinstance(filepath, (bytes, bytearray)):
        filepath = BytesIO(filepath)
    with zipfile.ZipFile(filepath, "r") as zip_ref:
        for filename in zip_ref.namelist():
            print(f"processing file: {filename}")
            lowered = filename.lower()
            if lowered.endswith((".tif", ".tiff")):
                png_images = convert_multi_page_tiff_to_png(zip_ref.read(filename))
            elif lowered.endswith(".pdf"):
                png_images = convert_multi_page_pdf_to_png(zip_ref.read(filename))
            else:
                print(f"Skipping processing of unsupported file format: {filename}")
                continue
            # save_images is a local-testing helper that is NOT defined in this
            # module (calling it unconditionally raised NameError); invoke it
            # only when it has actually been provided.
            save_fn = globals().get("save_images")
            if save_fn is not None:
                save_fn(png_images, filename)
def lambda_handler(event, context):
    """Convert an uploaded ZIP of TIFF/PDF scans under an S3 prefix to PNGs.

    Expects ``event["FILENAME"]`` to name an S3 key prefix ("directory")
    containing one metadata ``.json`` file and one ``.zip`` archive.
    Progress is tracked in DynamoDB via ``update_document_status``; the
    source objects are deleted once conversion succeeds.

    Args:
        event: Lambda event; must carry the prefix under key ``FILENAME``.
        context: Lambda context object (unused).

    Returns:
        tuple: ``(event, folder_name)`` on success.

    Raises:
        ValueError: If the event has no usable ``FILENAME``.
        Exception: If the JSON/ZIP pair is missing or processing fails.
    """
    batch_id = None  # known only after the metadata JSON has been read
    try:
        logger.info("Received Event: %s", json.dumps(event, indent=2))

        folder_name = event.get("FILENAME")
        if folder_name is None:
            logger.error("Directory name not found in the event.")
            raise ValueError("Directory name not found in the event.")
        if not isinstance(folder_name, str):
            logger.error("Directory name is not a string.")
            raise ValueError("Directory name is not a string.")
        logger.info(f"Directory Name: {folder_name}")

        # Bucket is overridable via environment so non-dev deployments do
        # not need a code change; the default preserves prior behavior.
        bucket_name = os.environ.get(
            "DOC_PROCESSING_BUCKET", "cnc-aws-ecs-mecc-dev-doc-processing-v2"
        )

        json_file_key = None
        matching_zip_file_key = None
        # Paginate: a bare list_objects_v2 call returns at most 1000 keys
        # and would silently miss files under large prefixes.
        paginator = s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_name):
            for obj in page.get("Contents", []):
                file_key = obj["Key"]
                logger.info(f"Processing file: {file_key}")
                if file_key.endswith(".json") and file_key.startswith(folder_name):
                    json_file_key = file_key
                elif file_key.endswith(".zip") and file_key.startswith(folder_name):
                    matching_zip_file_key = file_key
        logger.info(f"JSON file key: {json_file_key}")
        logger.info(f"Matching ZIP file key: {matching_zip_file_key}")

        if json_file_key is None or matching_zip_file_key is None:
            logger.error("Both JSON and matching ZIP files are not found.")
            raise Exception("Both JSON and matching ZIP files are not found.")

        try:
            response = s3.get_object(Bucket=bucket_name, Key=json_file_key)
            data = json.loads(response["Body"].read().decode("utf-8"))
            batch_id = data["SCANMETADATA"]["batchid"]
            logger.info("Batch ID: %s", batch_id)

            zip_obj = s3.get_object(Bucket=bucket_name, Key=matching_zip_file_key)
            update_document_status(batch_id, "inprogress", "Image Conversion")
            # The whole ZIP archive is read into memory here; for very large
            # uploads this contributes to Lambda memory pressure.
            zip_data = zip_obj["Body"].read()
            process_zip_file(zip_data)
            update_document_status(batch_id, "completed", "Image Conversion")

            # Clean up the source objects and the pseudo-directory marker.
            s3.delete_object(Bucket=bucket_name, Key=json_file_key)
            logger.info("JSON file removed from original location: %s", json_file_key)
            s3.delete_object(Bucket=bucket_name, Key=matching_zip_file_key)
            logger.info("Original ZIP file deleted: %s", matching_zip_file_key)
            if json_file_key.startswith(folder_name):
                s3.delete_object(Bucket=bucket_name, Key=folder_name + "/")
                logger.info("Directory removed: %s", folder_name)
            return event, folder_name
        except Exception as e:
            logger.error(f"Error processing ZIP file {matching_zip_file_key}: {e}")
            update_document_status(batch_id, "error", "Image Conversion")
            raise e
    except Exception as e:
        logger.error(f"Error: {e}")
        if batch_id:
            update_document_status(batch_id, "failed", "Image Conversion")
        raise e
错误: { "errorType": "Runtime.ExitError", "errorMessage": "RequestId: ddb8e3d0-0ab5-4462-ab8f-ae7797fb93ad Error: Runtime exited with error: signal: killed" }
您的 Lambda 函数可能内存不足("signal: killed" 通常表示进程因超出内存限制而被终止)。您必须将函数的 MemorySize 配置(如果您使用 SAM 或 CDK)设置为更高的值。附注:当您为函数分配更多内存时,也会按比例获得更多的 vCPU 算力。