我想在 Python 中使用 AWS 将大型 PDF 文件(800 页)处理为 PNG 文件。
您能给我推荐 Lambda 函数的代码吗?
我尝试将多页 PDF 转换为 PNG,但处理到约 50 页时出现运行时退出错误(Runtime.ExitError)。
import json
import boto3
import zipfile
from io import BytesIO
from PIL import Image
import logging
import os
from datetime import datetime
import fitz
import urllib.parse
# Configure module-level logging; Lambda containers reuse this setup
# across warm invocations.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize AWS clients at module scope so they are created once per
# container and shared by every invocation.
lambda_client = boto3.client("lambda")  # NOTE(review): not referenced in this file — confirm before removing
dynamodb = boto3.resource("dynamodb")
s3 = boto3.client("s3")
# Specify the DynamoDB table name
TABLE_NAME = "XYZ"  # placeholder — the document-status table used by update_document_status
# Function to update document status in DynamoDB
def update_document_status(batch_id, status, stage):
    """Record the current status/stage of a batch in the DynamoDB table.

    Sets ``status``, ``stage`` and ``updated_date`` on the item keyed by
    ``batch_id``. Terminal statuses ("completed" / "failed") additionally
    stamp ``end_time`` and ``update_time`` with the same timestamp.

    Args:
        batch_id: Partition key of the item to update.
        status: New status value (e.g. "inprogress", "completed", "failed").
        stage: Pipeline stage name (e.g. "Image Conversion").

    Returns:
        True when the update succeeds.

    Raises:
        Exception: Re-raises any DynamoDB error after logging it.
    """
    try:
        table = dynamodb.Table(TABLE_NAME)
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        names = {
            "#status": "status",
            "#stage": "stage",
            "#updated_date": "updated_date",
        }
        values = {
            ":status": status,
            ":stage": stage,
            ":updated_date": now,
        }
        expr = "SET #status = :status, #stage = :stage, #updated_date = :updated_date"

        # Terminal states also record when processing finished.
        if status in ("completed", "failed"):
            expr += ", #end_time = :end_time, #update_time = :update_time"
            names.update({"#end_time": "end_time", "#update_time": "update_time"})
            values.update({":end_time": now, ":update_time": now})

        response = table.update_item(
            Key={"batch_id": batch_id},
            UpdateExpression=expr,
            ExpressionAttributeNames=names,
            ExpressionAttributeValues=values,
            ReturnValues="UPDATED_NEW",
        )
        logger.info("UpdateItem succeeded: %s", response)
        return True
    except Exception as e:
        logger.error("Error updating document status: %s", e)
        raise
# Function to convert multi-page TIFF file to PNG
def convert_multi_page_tiff_to_png(tiff_file):
    """Convert a (possibly multi-page) TIFF into a list of PNG byte strings.

    Args:
        tiff_file: The TIFF image as raw ``bytes`` (e.g. the return value of
            ``ZipFile.read``) or anything ``PIL.Image.open`` accepts
            (path or file-like object).

    Returns:
        list[bytes]: One PNG-encoded image per TIFF page, in page order.

    Raises:
        Exception: Re-raises any PIL error after logging it.
    """
    try:
        # The caller (process_zip_file) passes raw bytes from ZipFile.read();
        # PIL.Image.open requires a path or file-like object, so wrap them.
        if isinstance(tiff_file, (bytes, bytearray)):
            tiff_file = BytesIO(tiff_file)
        png_images = []
        with Image.open(tiff_file) as img:
            # A single loop covers both single- and multi-page TIFFs;
            # getattr guards formats that do not expose n_frames at all.
            for frame in range(getattr(img, "n_frames", 1)):
                img.seek(frame)
                buf = BytesIO()
                img.save(buf, format="PNG")
                png_images.append(buf.getvalue())
        return png_images
    except Exception as e:
        logger.error(f"Error converting multi-page TIFF to PNG: {e}")
        raise
# Function to convert multi-page PDF file to PNG
def convert_multi_page_pdf_to_png(pdf_data):
    """Render every page of a PDF document to a PNG image.

    Args:
        pdf_data: The PDF document as raw bytes.

    Returns:
        list[bytes]: One PNG-encoded image per page, in page order.

    Raises:
        Exception: Re-raises any PyMuPDF error after logging it.

    Note:
        All rendered pages are kept in memory at once. For large documents
        (hundreds of pages) this dominates Lambda memory usage — raise the
        function's MemorySize, or stream each page to S3 and drop it,
        instead of accumulating the whole list.
    """
    png_images = []
    doc = None
    try:
        doc = fitz.open(stream=pdf_data, filetype="pdf")
        for page in doc:
            # Encode the pixmap to PNG directly; skipping the PIL round-trip
            # avoids holding a second full-size RGB copy of every page.
            png_images.append(page.get_pixmap().tobytes("png"))
    except Exception as e:
        logger.error("Error converting multi-page PDF to PNG: %s", e)
        raise
    finally:
        # Always release the document's native resources (was leaked before).
        if doc is not None:
            doc.close()
    return png_images
def process_zip_file(filepath):
    """Walk a ZIP archive and convert each contained TIFF/PDF to PNG images.

    Args:
        filepath: Path or file-like object of the ZIP archive, or the archive
            itself as raw ``bytes`` (e.g. ``s3_object['Body'].read()``).

    Unsupported file types inside the archive are skipped with a message.
    """
    # Raw bytes (as read from S3) are not seekable; zipfile needs a
    # file-like object, so wrap them first.
    if isinstance(filepath, (bytes, bytearray)):
        filepath = BytesIO(filepath)
    with zipfile.ZipFile(filepath, "r") as zip_ref:
        for filename in zip_ref.namelist():
            print(f"processing file: {filename}")
            lowered = filename.lower()
            if lowered.endswith((".tif", ".tiff")):
                png_images = convert_multi_page_tiff_to_png(zip_ref.read(filename))
            elif lowered.endswith(".pdf"):
                png_images = convert_multi_page_pdf_to_png(zip_ref.read(filename))
            else:
                print(f"Skipping processing of unsupported file format: {filename}")
                continue
            # save_images is a local-testing helper that is NOT defined in this
            # module (calling it unconditionally raised NameError); invoke it
            # only when it has actually been provided.
            save_fn = globals().get("save_images")
            if save_fn is not None:
                save_fn(png_images, filename)
def lambda_handler(event, context):
    """Convert an uploaded ZIP of TIFF/PDF scans under an S3 prefix to PNGs.

    Expects ``event["FILENAME"]`` to name an S3 key prefix ("directory")
    containing one metadata ``.json`` file and one ``.zip`` archive.
    Progress is tracked in DynamoDB via ``update_document_status``; the
    source objects are deleted once conversion succeeds.

    Args:
        event: Lambda event; must carry the prefix under key ``FILENAME``.
        context: Lambda context object (unused).

    Returns:
        tuple: ``(event, folder_name)`` on success.

    Raises:
        ValueError: If the event has no usable ``FILENAME``.
        Exception: If the JSON/ZIP pair is missing or processing fails.
    """
    batch_id = None  # known only after the metadata JSON has been read
    try:
        logger.info("Received Event: %s", json.dumps(event, indent=2))

        folder_name = event.get("FILENAME")
        if folder_name is None:
            logger.error("Directory name not found in the event.")
            raise ValueError("Directory name not found in the event.")
        if not isinstance(folder_name, str):
            logger.error("Directory name is not a string.")
            raise ValueError("Directory name is not a string.")
        logger.info(f"Directory Name: {folder_name}")

        # Bucket is overridable via environment so non-dev deployments do
        # not need a code change; the default preserves prior behavior.
        bucket_name = os.environ.get(
            "DOC_PROCESSING_BUCKET", "cnc-aws-ecs-mecc-dev-doc-processing-v2"
        )

        json_file_key = None
        matching_zip_file_key = None
        # Paginate: a bare list_objects_v2 call returns at most 1000 keys
        # and would silently miss files under large prefixes.
        paginator = s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_name):
            for obj in page.get("Contents", []):
                file_key = obj["Key"]
                logger.info(f"Processing file: {file_key}")
                if file_key.endswith(".json") and file_key.startswith(folder_name):
                    json_file_key = file_key
                elif file_key.endswith(".zip") and file_key.startswith(folder_name):
                    matching_zip_file_key = file_key
        logger.info(f"JSON file key: {json_file_key}")
        logger.info(f"Matching ZIP file key: {matching_zip_file_key}")

        if json_file_key is None or matching_zip_file_key is None:
            logger.error("Both JSON and matching ZIP files are not found.")
            raise Exception("Both JSON and matching ZIP files are not found.")

        try:
            response = s3.get_object(Bucket=bucket_name, Key=json_file_key)
            data = json.loads(response["Body"].read().decode("utf-8"))
            batch_id = data["SCANMETADATA"]["batchid"]
            logger.info("Batch ID: %s", batch_id)

            zip_obj = s3.get_object(Bucket=bucket_name, Key=matching_zip_file_key)
            update_document_status(batch_id, "inprogress", "Image Conversion")
            # The whole ZIP archive is read into memory here; for very large
            # uploads this contributes to Lambda memory pressure.
            zip_data = zip_obj["Body"].read()
            process_zip_file(zip_data)
            update_document_status(batch_id, "completed", "Image Conversion")

            # Clean up the source objects and the pseudo-directory marker.
            s3.delete_object(Bucket=bucket_name, Key=json_file_key)
            logger.info("JSON file removed from original location: %s", json_file_key)
            s3.delete_object(Bucket=bucket_name, Key=matching_zip_file_key)
            logger.info("Original ZIP file deleted: %s", matching_zip_file_key)
            if json_file_key.startswith(folder_name):
                s3.delete_object(Bucket=bucket_name, Key=folder_name + "/")
                logger.info("Directory removed: %s", folder_name)
            return event, folder_name
        except Exception as e:
            logger.error(f"Error processing ZIP file {matching_zip_file_key}: {e}")
            update_document_status(batch_id, "error", "Image Conversion")
            raise e
    except Exception as e:
        logger.error(f"Error: {e}")
        if batch_id:
            update_document_status(batch_id, "failed", "Image Conversion")
        raise e
错误: { "errorType": "Runtime.ExitError", "errorMessage": "RequestId: ddb8e3d0-0ab5-4462-ab8f-ae7797fb93ad Error: Runtime exited with error: signal: killed" }
您的 Lambda 函数可能内存不足("signal: killed" 通常表示进程因超出内存限制而被终止)。您必须将函数的 MemorySize 配置(如果您使用 SAM 或 CDK)设置为更高的值。附注:当您为函数分配更多内存时,也会按比例获得更多的 vCPU 算力。