使用iText Java在AWS S3上提取pdf附件

问题描述 投票:-1回答:1

我正在使用下面的iText Java代码从PDF文件中提取附件。在本地系统上可以正常工作:它从PDF中提取XML文件并存储到strOutputPath。我想在AWS S3上执行此操作:PDF文件位于S3上,附件也应提取到S3上。在这种情况下,如何使用S3上文件的绝对路径?我尝试了s3client.getUrl().toExternalForm(),但收到HTTP 403错误。

import java.util.Iterator;
import java.util.Set;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.File;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfArray;
import com.itextpdf.text.pdf.PdfDictionary;
import java.io.IOException;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;

/**
 * Command-line tool that extracts the XML attachments embedded in a PDF.
 *
 * <p>Usage: {@code app <input.pdf> <output-path-prefix>}. Every embedded file whose
 * name ends with {@code .xml} is written to {@code <output-path-prefix><millis>.xml}.
 *
 * <p>NOTE: only the {@code /Names} array found directly on the catalog's
 * {@code /EmbeddedFiles} entry is read; name trees that are split into {@code /Kids}
 * nodes are not traversed.
 */
public class app
{
    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path of the PDF to read, {@code args[1]} is the
     *             prefix (directory plus optional file-name prefix) for the extracted files
     */
    public static void main(final String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: app <input.pdf> <output-path-prefix>");
            return;
        }
        final String strInputPath = args[0];
        final String strOutputPath = args[1];

        PdfReader pdfReader = null;
        try {
            pdfReader = new PdfReader(strInputPath);
            final PdfDictionary catalog = pdfReader.getCatalog();

            // Each of these entries is optional; a PDF without attachments has none of them.
            final PdfDictionary names = catalog.getAsDict(PdfName.NAMES);
            final PdfDictionary embeddedFiles =
                    (names == null) ? null : names.getAsDict(PdfName.EMBEDDEDFILES);
            final PdfArray embeddedFilesArray =
                    (embeddedFiles == null) ? null : embeddedFiles.getAsArray(PdfName.NAMES);
            if (embeddedFilesArray == null) {
                System.err.println("No embedded files found in " + strInputPath);
                return;
            }

            for (int i = 0; i < embeddedFilesArray.size(); ++i) {
                // The array alternates between name strings and file specifications;
                // getAsDict returns null for the entries that are not dictionaries.
                final PdfDictionary fileSpec = embeddedFilesArray.getAsDict(i);
                if (fileSpec == null) {
                    continue;
                }
                final PdfObject nameObject = fileSpec.getAsString(PdfName.F);
                if (nameObject == null) {
                    continue;
                }
                final String strFileName = nameObject.toString();
                System.out.println(strFileName);
                if (strFileName.endsWith(".xml")) {
                    extractFiles(pdfReader, fileSpec, nextOutputFile(strOutputPath).getPath());
                }
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            if (pdfReader != null) {
                pdfReader.close();
            }
        }
    }

    /**
     * Picks a timestamp-based output file that does not exist yet, so that two attachments
     * extracted within the same millisecond do not overwrite each other.
     *
     * @param outputPrefix prefix prepended to the generated file name
     * @return a file named {@code <prefix><millis>.xml}, or {@code <prefix><millis>_<n>.xml}
     *         when the plain name is already taken
     */
    private static File nextOutputFile(final String outputPrefix) {
        final String stamp = String.valueOf(System.currentTimeMillis());
        File candidate = new File(outputPrefix + stamp + ".xml");
        for (int suffix = 1; candidate.exists(); ++suffix) {
            candidate = new File(outputPrefix + stamp + "_" + suffix + ".xml");
        }
        return candidate;
    }

    /**
     * Writes the stream embedded in a file specification to disk.
     *
     * @param pdfReader   reader the file specification belongs to (kept for signature
     *                    compatibility; not used)
     * @param filespec    file specification dictionary taken from the embedded-files array
     * @param strFileName path of the file to create
     * @throws IOException if the stream cannot be decoded or the file cannot be written
     */
    private static void extractFiles(final PdfReader pdfReader, final PdfDictionary filespec, final String strFileName) throws IOException {
        final PdfDictionary refs = filespec.getAsDict(PdfName.EF);
        if (refs == null) {
            // A file specification without /EF has no embedded content.
            return;
        }
        final PRStream prStream = findEmbeddedStream(refs);
        if (prStream == null) {
            return;
        }
        // Decode before opening the target so a decoding failure leaves no empty file behind.
        final byte[] content = PdfReader.getStreamBytes(prStream);
        try (FileOutputStream outputStream = new FileOutputStream(new File(strFileName))) {
            outputStream.write(content);
        }
    }

    /**
     * Selects the stream to extract from an /EF dictionary: /UF first, then /F, then
     * whichever other entry resolves to a stream. The result is written exactly once,
     * instead of once per key in unspecified order.
     *
     * @param refs the /EF dictionary of a file specification
     * @return the embedded stream, or {@code null} when no entry resolves to one
     */
    private static PRStream findEmbeddedStream(final PdfDictionary refs) {
        final PdfName[] preferredKeys = { PdfName.UF, PdfName.F };
        for (final PdfName key : preferredKeys) {
            final PRStream stream = resolveStream(refs, key);
            if (stream != null) {
                return stream;
            }
        }
        final Set<PdfName> keys = refs.getKeys();
        for (final PdfName key : keys) {
            final PRStream stream = resolveStream(refs, key);
            if (stream != null) {
                return stream;
            }
        }
        return null;
    }

    /**
     * Resolves one /EF entry to the stream it references.
     *
     * @return the stream, or {@code null} when the entry is missing or is not a stream
     */
    private static PRStream resolveStream(final PdfDictionary refs, final PdfName key) {
        final PdfObject resolved = PdfReader.getPdfObject(refs.getAsIndirectObject(key));
        return (resolved instanceof PRStream) ? (PRStream) resolved : null;
    }
}
java itext
1个回答
0
投票

我认为您需要做的是编写一个对S3存储桶上的文件起作用并执行以下步骤的Java客户端:

  • 从S3下载所需的文件。
  • 从文件中提取附件。
  • 将生成的文件上传回S3。

执行上述步骤的示例代码如下:

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import com.amazonaws.services.s3.*;
import com.amazonaws.services.s3.model.*;
import com.itextpdf.text.pdf.*;

/**
 * Downloads a PDF from Amazon S3 to a local file, extracts its embedded XML attachments
 * to local files, and uploads each extracted file back to S3.
 *
 * <p>The bucket names and object keys in {@link #main(String[])} are placeholders.
 *
 * <p>NOTE: only the {@code /Names} array found directly on the catalog's
 * {@code /EmbeddedFiles} entry is read; name trees that are split into {@code /Kids}
 * nodes are not traversed.
 */
public class S3PDFAttachmentExtractor {

    /** Size in bytes of the buffer used when copying the S3 object to disk. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Runs the download / extract / upload sequence.
     *
     * @param args not used
     * @throws IOException if the download, the extraction or the upload fails
     */
    public static void main(String[] args) throws IOException {
        AmazonS3Client amazonS3Client = new AmazonS3Client();

        // Download the PDF from S3 into a local file; both the S3 object and the local
        // file are closed even when the copy fails.
        try (S3Object object = amazonS3Client.getObject("<yours3location>", "fileKey");
             FileOutputStream out = new FileOutputStream("tempOutputFile.pdf")) {
            writeToFile(object.getObjectContent(), out);
        }

        // The second argument is a prefix: the real file names carry a timestamp, so use
        // the returned paths instead of guessing the name.
        List<String> extractedFiles = extractXmlAttachments("tempOutputFile.pdf", "tempAttachement_");

        // Upload every extracted attachment under its local file name.
        for (String extractedFile : extractedFiles) {
            uploadFile("<s3bucket.fully.qualified.name>", extractedFile, new File(extractedFile).getName());
        }
    }

    /**
     * Copies all bytes of {@code input} to {@code out} and closes {@code input}.
     * {@code out} is flushed but left open; closing it is the caller's job.
     *
     * @param input stream to read from; closed on return
     * @param out   stream to write to
     * @throws IOException if reading or writing fails
     */
    private static void writeToFile(InputStream input, FileOutputStream out) throws IOException {
        try (InputStream in = input) {
            byte[] chunk = new byte[COPY_BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = in.read(chunk)) != -1) {
                // Write only what was actually read; the rest of the buffer holds stale
                // bytes from an earlier iteration.
                out.write(chunk, 0, bytesRead);
            }
            out.flush();
        }
    }

    /**
     * Extracts every embedded file whose name ends with {@code .xml} from a PDF.
     * I/O errors are printed to standard error and not propagated.
     *
     * @param strInputPath  path of the PDF to read
     * @param strOutputPath prefix of the files to create; each attachment is written to
     *                      {@code <strOutputPath><millis>.xml}
     */
    public static void extractAttachment(final String strInputPath, final String strOutputPath) {
        try {
            extractXmlAttachments(strInputPath, strOutputPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Does the work of {@link #extractAttachment(String, String)} and reports which files
     * were written.
     *
     * @param inputPath    path of the PDF to read
     * @param outputPrefix prefix of the files to create
     * @return paths of the files that were written, in extraction order; empty when the PDF
     *         has no XML attachments
     * @throws IOException if the PDF cannot be read or a file cannot be written
     */
    private static List<String> extractXmlAttachments(final String inputPath, final String outputPrefix) throws IOException {
        final List<String> writtenFiles = new ArrayList<>();
        final PdfReader pdfReader = new PdfReader(inputPath);
        try {
            final PdfDictionary catalog = pdfReader.getCatalog();

            // Each of these entries is optional; a PDF without attachments has none of them.
            final PdfDictionary names = catalog.getAsDict(PdfName.NAMES);
            final PdfDictionary embeddedFiles =
                    (names == null) ? null : names.getAsDict(PdfName.EMBEDDEDFILES);
            final PdfArray embeddedFilesArray =
                    (embeddedFiles == null) ? null : embeddedFiles.getAsArray(PdfName.NAMES);
            if (embeddedFilesArray == null) {
                return writtenFiles;
            }

            for (int i = 0; i < embeddedFilesArray.size(); ++i) {
                // The array alternates between name strings and file specifications;
                // getAsDict returns null for the entries that are not dictionaries.
                final PdfDictionary fileSpec = embeddedFilesArray.getAsDict(i);
                if (fileSpec == null) {
                    continue;
                }
                final PdfObject nameObject = fileSpec.getAsString(PdfName.F);
                if (nameObject == null) {
                    continue;
                }
                final String strFileName = nameObject.toString();
                System.out.println(strFileName);
                if (strFileName.endsWith(".xml")) {
                    final String targetPath = nextOutputFile(outputPrefix).getPath();
                    if (extractFiles(pdfReader, fileSpec, targetPath)) {
                        writtenFiles.add(targetPath);
                    }
                }
            }
        } finally {
            pdfReader.close();
        }
        return writtenFiles;
    }

    /**
     * Picks a timestamp-based output file that does not exist yet, so that two attachments
     * extracted within the same millisecond do not overwrite each other.
     *
     * @param outputPrefix prefix prepended to the generated file name
     * @return a file named {@code <prefix><millis>.xml}, or {@code <prefix><millis>_<n>.xml}
     *         when the plain name is already taken
     */
    private static File nextOutputFile(final String outputPrefix) {
        final String stamp = String.valueOf(System.currentTimeMillis());
        File candidate = new File(outputPrefix + stamp + ".xml");
        for (int suffix = 1; candidate.exists(); ++suffix) {
            candidate = new File(outputPrefix + stamp + "_" + suffix + ".xml");
        }
        return candidate;
    }

    /**
     * Writes the stream embedded in a file specification to disk.
     *
     * @param pdfReader   reader the file specification belongs to (kept for signature
     *                    compatibility; not used)
     * @param filespec    file specification dictionary taken from the embedded-files array
     * @param strFileName path of the file to create
     * @return {@code true} if a file was written, {@code false} if the specification has no
     *         embedded stream
     * @throws IOException if the stream cannot be decoded or the file cannot be written
     */
    private static boolean extractFiles(final PdfReader pdfReader, final PdfDictionary filespec, final String strFileName) throws IOException {
        final PdfDictionary refs = filespec.getAsDict(PdfName.EF);
        if (refs == null) {
            // A file specification without /EF has no embedded content.
            return false;
        }
        final PRStream prStream = findEmbeddedStream(refs);
        if (prStream == null) {
            return false;
        }
        // Decode before opening the target so a decoding failure leaves no empty file behind.
        final byte[] content = PdfReader.getStreamBytes(prStream);
        try (FileOutputStream outputStream = new FileOutputStream(new File(strFileName))) {
            outputStream.write(content);
        }
        return true;
    }

    /**
     * Selects the stream to extract from an /EF dictionary: /UF first, then /F, then
     * whichever other entry resolves to a stream. The result is written exactly once,
     * instead of once per key in unspecified order.
     *
     * @param refs the /EF dictionary of a file specification
     * @return the embedded stream, or {@code null} when no entry resolves to one
     */
    private static PRStream findEmbeddedStream(final PdfDictionary refs) {
        final PdfName[] preferredKeys = { PdfName.UF, PdfName.F };
        for (final PdfName key : preferredKeys) {
            final PRStream stream = resolveStream(refs, key);
            if (stream != null) {
                return stream;
            }
        }
        final Set<PdfName> keys = refs.getKeys();
        for (final PdfName key : keys) {
            final PRStream stream = resolveStream(refs, key);
            if (stream != null) {
                return stream;
            }
        }
        return null;
    }

    /**
     * Resolves one /EF entry to the stream it references.
     *
     * @return the stream, or {@code null} when the entry is missing or is not a stream
     */
    private static PRStream resolveStream(final PdfDictionary refs, final PdfName key) {
        final PdfObject resolved = PdfReader.getPdfObject(refs.getAsIndirectObject(key));
        return (resolved instanceof PRStream) ? (PRStream) resolved : null;
    }

    /**
     * Uploads a local file to S3 with content type {@code application/xml}.
     *
     * @param bucketFullPath name of the target bucket
     * @param fileLocation   path of the local file to upload
     * @param fileName       object key to store the file under
     * @throws IOException if the local file cannot be opened or closed
     */
    private static void uploadFile(String bucketFullPath, String fileLocation, String fileName) throws IOException {
        AmazonS3Client amazonS3Client = new AmazonS3Client();
        File file = new File(fileLocation);

        ObjectMetadata objectMetadata = new ObjectMetadata();
        objectMetadata.setContentType("application/xml");
        // Declare the length up front so the SDK does not have to buffer the whole stream
        // in memory to compute it.
        objectMetadata.setContentLength(file.length());

        try (InputStream bis = new FileInputStream(file)) {
            amazonS3Client.putObject(bucketFullPath, fileName, bis, objectMetadata);
        }
    }
}

请注意,进行这种处理的更好方法是基于上述代码用Java编写一个AWS Lambda函数。AWS Lambda可以轻松配置为处理S3存储中的事件,因此每当S3存储桶中有文件被写入或修改时,您的代码都会被自动调用。有关更多详细信息,请查阅AWS Lambda文档(AWS Lambda Documentation)。

© www.soinside.com 2019 - 2024. All rights reserved.