我正在使用下面的iText Java代码从PDF文件中提取附件。在本地系统上可以正常工作。它从PDF提取XML文件并存储在strOutputPath上。我想在AWS S3上执行此操作。 PDF文件将在S3上,并且附件应在S3上提取。在这种情况下,如何在S3上使用文件的绝对路径。我使用了s3client.getUrl()。toExternalForm();,但收到HTTP 403错误。
import java.util.Iterator;
import java.util.Set;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.File;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfArray;
import com.itextpdf.text.pdf.PdfDictionary;
import java.io.IOException;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
public class app
{
public static void main(final String[] args) {
try {
final String strInputPath = args[0];
final String strOutputPath = args[1];
final PdfReader pdfReader = new PdfReader(strInputPath);
final PdfDictionary catalog = pdfReader.getCatalog();
final PdfDictionary names = catalog.getAsDict(PdfName.NAMES);
final PdfDictionary embeddedFiles = names.getAsDict(PdfName.EMBEDDEDFILES);
final PdfArray embeddedFilesArray = embeddedFiles.getAsArray(PdfName.NAMES);
for (int i = 0; i < embeddedFilesArray.size(); ++i) {
final PdfDictionary FileSpec = embeddedFilesArray.getAsDict(i);
if (FileSpec != null) {
String strFileName = FileSpec.getAsString(PdfName.F).toString();
System.out.println(strFileName);
if (strFileName.endsWith(".xml")) {
strFileName = String.valueOf(System.currentTimeMillis()) + ".xml";
extractFiles(pdfReader, FileSpec, String.valueOf(strOutputPath) + strFileName);
}
}
}
}
catch (IOException e) {
e.printStackTrace();
}
}
private static void extractFiles(final PdfReader pdfReader, final PdfDictionary filespec, final String strFileName) {
final PdfDictionary refs = filespec.getAsDict(PdfName.EF);
PRStream prStream = null;
FileOutputStream outputStream = null;
final Set<PdfName> keys = (Set<PdfName>)refs.getKeys();
try {
for (final PdfName key : keys) {
prStream = (PRStream)PdfReader.getPdfObject((PdfObject)refs.getAsIndirectObject(key));
outputStream = new FileOutputStream(new File(strFileName));
outputStream.write(PdfReader.getStreamBytes(prStream));
outputStream.flush();
outputStream.close();
}
}
catch (FileNotFoundException e) {
e.printStackTrace();
}
catch (IOException e2) {
e2.printStackTrace();
}
finally {
try {
if (outputStream != null) {
outputStream.close();
}
}
catch (IOException e3) {
e3.printStackTrace();
}
}
try {
if (outputStream != null) {
outputStream.close();
}
}
catch (IOException e3) {
e3.printStackTrace();
}
}
}
我认为您需要做的是编写一个对S3存储桶上的文件起作用并执行以下步骤的Java客户端:
执行上述步骤的示例代码如下:
import java.io.*;
import java.util.Set;
import com.amazonaws.services.s3.*;
import com.amazonaws.services.s3.model.*;
import com.itextpdf.text.pdf.*;
public class S3PDFAttachmentExtractor {
public static void main(String[] args) throws IOException {
// download file from S3
AmazonS3Client amazonS3Client = new AmazonS3Client();
S3Object object = amazonS3Client.getObject("<yours3location>", "fileKey");
// write the file content to a local file.
S3ObjectInputStream objectContent = object.getObjectContent();
FileOutputStream out = new FileOutputStream("tempOutputFile.pdf");
writeToFile(objectContent, out);
// Extract attachment from the downloaded file.
extractAttachment("tempOutputFile.pdf", "tempAttachement.xml");
//upload the attachment
uploadFile("<s3bucket.fully.qualified.name>", "tempAttachement.xml", "attachementNameOnS3.xml");
}
private static void writeToFile(InputStream input, FileOutputStream out) throws IOException {
// Read the text input stream one line at a time and display each line.
try (BufferedInputStream in = new BufferedInputStream(input);) {
byte[] chunk = new byte[1024];
while (in.read(chunk) > 0) {
out.write(chunk);
}
} finally {
input.close();
}
}
public static void extractAttachment(final String strInputPath, final String strOutputPath) {
try {
final PdfReader pdfReader = new PdfReader(strInputPath);
final PdfDictionary catalog = pdfReader.getCatalog();
final PdfDictionary names = catalog.getAsDict(PdfName.NAMES);
final PdfDictionary embeddedFiles = names.getAsDict(PdfName.EMBEDDEDFILES);
final PdfArray embeddedFilesArray = embeddedFiles.getAsArray(PdfName.NAMES);
for (int i = 0; i < embeddedFilesArray.size(); ++i) {
final PdfDictionary FileSpec = embeddedFilesArray.getAsDict(i);
if (FileSpec != null) {
String strFileName = FileSpec.getAsString(PdfName.F).toString();
System.out.println(strFileName);
if (strFileName.endsWith(".xml")) {
strFileName = String.valueOf(System.currentTimeMillis()) + ".xml";
extractFiles(pdfReader, FileSpec, String.valueOf(strOutputPath) + strFileName);
}
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
private static void extractFiles(final PdfReader pdfReader, final PdfDictionary filespec, final String strFileName) {
final PdfDictionary refs = filespec.getAsDict(PdfName.EF);
PRStream prStream = null;
FileOutputStream outputStream = null;
final Set<PdfName> keys = (Set<PdfName>) refs.getKeys();
try {
for (final PdfName key : keys) {
prStream = (PRStream) PdfReader.getPdfObject((PdfObject) refs.getAsIndirectObject(key));
outputStream = new FileOutputStream(new File(strFileName));
outputStream.write(PdfReader.getStreamBytes(prStream));
outputStream.flush();
outputStream.close();
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e2) {
e2.printStackTrace();
} finally {
try {
if (outputStream != null) {
outputStream.close();
}
} catch (IOException e3) {
e3.printStackTrace();
}
}
try {
if (outputStream != null) {
outputStream.close();
}
} catch (IOException e3) {
e3.printStackTrace();
}
}
private static void uploadFile(String bucketFullPath, String fileLocation, String fileName) throws IOException {
AmazonS3Client amazonS3Client = new AmazonS3Client();
InputStream bis = new FileInputStream(fileLocation);
ObjectMetadata objectMetadata = new ObjectMetadata();
objectMetadata.setContentType("application/xml");
amazonS3Client.putObject(bucketFullPath, fileName, bis, objectMetadata);
}
}
请注意进行这种处理的更好方法是使用上述代码用Java编写AWS Lambda function。由于可以轻松配置AWS Lambada来处理S3存储中的事件,因此在S3存储桶中写入或修改文件时,将自动调用您的代码。有关更多详细信息,请检查AWS Lambda Documentation