我想通过用较低分辨率的图像替换高分辨率图像来减小 PDF 文件的大小。要解决此问题,我必须:
- 从pdf提取图像(流)
- 压缩图像
- 用压缩图像替换pdf中的图像(流)
当我提取png图像并替换它们时,透明背景变为黑色背景。我从pdf中提取图像以找出原因。pdf用于保存png的流非常奇怪:如果我尝试从pdf中提取png图像,则会得到两个不同的图像:一个8位彩色图像和一个24位彩色图像。
...
1 0 obj
<</Type/XObject/Subtype/Image/Width 1920/Height 1035/Length 24720/ColorSpace/DeviceGray/BitsPerComponent 8/Filter/FlateDecode>>stream
...
endstream
endobj
2 0 obj
<</Type/XObject/Subtype/Image/Width 1920/Height 1035/SMask 1 0 R/Length 47751/ColorSpace[/CalRGB<</Gamma[2.2 2.2 2.2]/Matrix[0.41239 0.21264 0.01933 0.35758 0.71517 0.11919 0.18045 0.07218 0.9504]/WhitePoint[0.95043 1 1.09]>>]/Intent/Perceptual/BitsPerComponent 8/Filter/FlateDecode>>stream
...
endstream
...
原始图像(具有透明背景的32位彩色图像):
8位彩色图像:
24位彩色图像:
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.12</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.16</version>
</dependency>
`ImageExtractor` 将帮助您从 PDF 文件中提取图像。
/**
 * Dumps the image XObjects referenced by the pages of a PDF into a directory,
 * one file per image, using PDFBox.
 */
public class ImageExtractor {

    private static final Logger log = LoggerFactory.getLogger(ImageExtractor.class);

    /**
     * Writes every image XObject found in the page resources of {@code pdf} to
     * {@code imageDir}. Files are named {@code <pageIndex>-<nanoTime>.<suffix>}.
     *
     * @param pdf      the PDF document to read
     * @param imageDir target directory; created when it does not exist yet
     * @throws IOException if the PDF cannot be loaded or an image cannot be decoded or written
     */
    public void extract(File pdf, File imageDir) throws IOException {
        if (!imageDir.exists()) {
            imageDir.mkdirs();
        }
        // try-with-resources: the loaded document must be closed even when a page fails.
        try (PDDocument document = PDDocument.load(pdf)) {
            PDPageTree list = document.getPages();
            System.out.println("PDPageTree#count: " + list.getCount());
            int pageIndex = 1;
            for (PDPage page : list) {
                extractPageImages(page, pageIndex, imageDir);
                pageIndex++;
            }
        }
        log.info("Images have been extracted successfully! Check your images folder.");
    }

    /** Writes the image XObjects of a single page; pages without resources are skipped. */
    private void extractPageImages(PDPage page, int pageIndex, File imageDir) throws IOException {
        PDResources pdResources = page.getResources();
        if (pdResources == null) {
            // A page is not required to carry a resource dictionary.
            return;
        }
        System.out.println(pdResources.toString());
        for (COSName c : pdResources.getXObjectNames()) {
            System.out.println("PDResources[" + pageIndex + "]#COSName: " + c.getName());
            PDXObject o = pdResources.getXObject(c);
            // String concatenation tolerates a null XObject, unlike o.toString().
            System.out.println("PDResources[" + pageIndex + "]#PDXObject: " + o);
            // https://github.com/mkl-public/testarea-itext5/blob/master/src/test/java/mkl/testarea/itext5/extract/ImageExtraction.java
            if (o instanceof PDImageXObject) {
                PDImageXObject img = (PDImageXObject) o;
                String suffix = img.getSuffix();
                if (suffix == null) {
                    // NOTE(review): getSuffix() appears to return null for filters it does not
                    // know; fall back to PNG instead of passing null to ImageIO — confirm.
                    suffix = "png";
                }
                File file = new File(imageDir, pageIndex + "-" + System.nanoTime() + "." + suffix);
                // ImageIO.write reports "no writer for this format/image" via its return value.
                if (!ImageIO.write(img.getImage(), suffix, file)) {
                    log.warn("No ImageIO writer for format '{}'; image {} was not written", suffix, file);
                }
            }
        }
    }
}
`ReplaceHightResolutionImage` 是我用来减小 PDF 大小的代码。
package io.gitlab.donespeak.tutorial.pdf.reducesize.itext;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfBoolean;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfNumber;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import com.itextpdf.text.pdf.PdfStream;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import io.gitlab.donespeak.tutorial.pdf.reducesize.imagecompress.ImageCompressor;
import io.gitlab.donespeak.tutorial.pdf.reducesize.imagecompress.SimpleCompress;
import javax.imageio.ImageIO;
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
/**
 * Shrinks a PDF by re-encoding its embedded raster images as (optionally
 * down-scaled) JPEGs, using iText 5.
 *
 * <p>Every image XObject stored with a single {@code /DCTDecode} or
 * {@code /FlateDecode} filter is decoded, handed to the configured
 * {@link ImageCompressor} and written back in place. A soft mask
 * ({@code /SMask}) attached to an image is kept, so transparency survives
 * the replacement.
 */
public class ReplaceHightResolutionImage {

    private final ImageCompressor compressor;
    private final double quality;
    private final double scale;

    /**
     * Creates a replacer that uses {@link SimpleCompress}.
     *
     * @param quality quality value forwarded to the compressor
     * @param scale   scale factor forwarded to the compressor
     */
    public ReplaceHightResolutionImage(double quality, double scale) {
        this(quality, scale, new SimpleCompress());
    }

    /**
     * @param quality    quality value forwarded to the compressor
     * @param scale      scale factor forwarded to the compressor
     * @param compressor strategy used to shrink each decoded image
     */
    public ReplaceHightResolutionImage(double quality, double scale, ImageCompressor compressor) {
        this.compressor = compressor;
        this.quality = quality;
        this.scale = scale;
    }

    /**
     * Reads {@code pdf}, replaces its images by compressed versions and writes
     * the result to {@code output}.
     *
     * @param pdf    source document
     * @param output destination file; missing parent directories are created
     * @throws IOException       on read/write failure or when an image cannot be JPEG-encoded
     * @throws DocumentException if iText cannot produce the output document
     */
    public void replace(File pdf, File output) throws IOException, DocumentException {
        PdfReader reader;
        // The reader consumes the whole stream while being constructed, so the
        // file handle can be released immediately afterwards.
        try (FileInputStream in = new FileInputStream(pdf)) {
            reader = new PdfReader(in);
        }
        try {
            int n = reader.getXrefSize();
            for (int i = 0; i < n; i++) {
                PRStream stream = findImageStream(reader.getPdfObject(i));
                if (stream == null) {
                    continue;
                }
                PdfImageObject pdfImageObject = new PdfImageObject(stream);
                BufferedImage bi = pdfImageObject.getBufferedImage();
                if (bi == null) {
                    continue;
                }
                System.out.println("PdfReader#Xref: " + i + "," + pdfImageObject.getFileType());
                BufferedImage resultImage = compressor.compress(bi, pdfImageObject.getFileType(), quality, scale);
                replaceImage(stream, resultImage);
            }

            File outputDir = output.getAbsoluteFile().getParentFile();
            if (outputDir != null && !outputDir.isDirectory() && !outputDir.mkdirs()) {
                throw new IOException("Cannot create output directory " + outputDir);
            }
            try (FileOutputStream out = new FileOutputStream(output)) {
                PdfStamper stamper = new PdfStamper(reader, out);
                // Optional extra pass, currently disabled: furtherCompress(reader, stamper);
                stamper.close();
            }
        } finally {
            reader.close();
        }
    }

    /** Optional extra size reduction: drops form fields and unused objects and enables full compression. */
    private void furtherCompress(PdfReader reader, PdfStamper stamper) throws DocumentException {
        reader.removeFields();
        reader.removeUnusedObjects();
        stamper.setFullCompression();
        stamper.getWriter().setCompressionLevel(PdfStream.DEFAULT_COMPRESSION);
    }

    /**
     * Returns {@code object} as an image stream when it is one this class can
     * re-encode, otherwise {@code null}.
     *
     * <p>Accepted: image XObjects whose {@code /Filter} is the single name
     * {@code /DCTDecode} or {@code /FlateDecode}. Filter arrays make
     * {@code getAsName} return {@code null} and are therefore skipped.
     */
    private PRStream findImageStream(PdfObject object) {
        if (object == null || !object.isStream()) {
            return null;
        }
        PRStream stream = (PRStream) object;
        System.out.println(stream.getAsName(PdfName.SUBTYPE));
        if (!PdfName.IMAGE.equals(stream.getAsName(PdfName.SUBTYPE))) {
            // not an image XObject
            return null;
        }
        // A stencil mask (/ImageMask true) has no colour space of its own; turning it
        // into a DeviceRGB/DeviceGray JPEG would change what it means, so leave it alone.
        PdfBoolean imageMask = stream.getAsBoolean(PdfName.IMAGEMASK);
        if (imageMask != null && imageMask.booleanValue()) {
            return null;
        }
        PdfName filter = stream.getAsName(PdfName.FILTER);
        if (!PdfName.DCTDECODE.equals(filter) && !PdfName.FLATEDECODE.equals(filter)) {
            return null;
        }
        return stream;
    }

    /**
     * Overwrites {@code stream} with {@code resultImage} encoded as JPEG.
     *
     * <p>The {@code /SMask} reference of the old dictionary is carried over and
     * {@code /ColorSpace} follows the encoded image: a single-channel image is
     * declared {@code /DeviceGray} (required for soft-mask images), anything
     * else {@code /DeviceRGB}.
     *
     * @throws IOException if no JPEG writer accepts the image
     */
    private void replaceImage(PRStream stream, BufferedImage resultImage) throws IOException {
        BufferedImage encodable = flattenAlpha(resultImage);
        ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
        // write() returns false (and writes nothing) when no writer supports the image type.
        if (!ImageIO.write(encodable, "JPG", imgBytes)) {
            throw new IOException("Image of type " + encodable.getType() + " cannot be encoded as JPEG");
        }

        // clear() wipes the whole dictionary, so remember the soft-mask reference first.
        PdfObject softMask = stream.get(PdfName.SMASK);
        boolean singleChannel = encodable.getColorModel().getNumColorComponents() == 1;

        stream.clear();
        stream.setData(imgBytes.toByteArray(), false, PRStream.NO_COMPRESSION);
        stream.put(PdfName.TYPE, PdfName.XOBJECT);
        stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
        stream.put(PdfName.FILTER, PdfName.DCTDECODE);
        stream.put(PdfName.WIDTH, new PdfNumber(encodable.getWidth()));
        stream.put(PdfName.HEIGHT, new PdfNumber(encodable.getHeight()));
        stream.put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
        stream.put(PdfName.COLORSPACE, singleChannel ? PdfName.DEVICEGRAY : PdfName.DEVICERGB);
        if (softMask != null) {
            stream.put(PdfName.SMASK, softMask);
        }
    }

    /**
     * Returns {@code image} unchanged when it has no alpha channel; otherwise a
     * copy painted onto an opaque white RGB canvas, because JPEG cannot store alpha.
     */
    private static BufferedImage flattenAlpha(BufferedImage image) {
        if (!image.getColorModel().hasAlpha()) {
            return image;
        }
        BufferedImage opaque = new BufferedImage(image.getWidth(), image.getHeight(), BufferedImage.TYPE_INT_RGB);
        Graphics2D g = opaque.createGraphics();
        try {
            g.drawImage(image, 0, 0, Color.WHITE, null);
        } finally {
            g.dispose();
        }
        return opaque;
    }
}
package io.gitlab.donespeak.tutorial.pdf.reducesize.itext;
/**
 * {@link ImageCompressor} that delegates the resizing to Thumbnailator.
 */
public class ThumbnailatorCompressor implements ImageCompressor {

    /**
     * Produces a scaled copy of {@code image} that keeps the source's
     * {@link BufferedImage} type.
     *
     * @param image       image to shrink
     * @param imageFormat not used by this implementation
     * @param quality     value passed to Thumbnailator's {@code outputQuality}
     * @param scale       value passed to Thumbnailator's {@code scale}
     * @return the image built by Thumbnailator
     * @throws IOException if Thumbnailator fails to build the image
     */
    @Override
    public BufferedImage compress(BufferedImage image, String imageFormat, double quality, double scale) throws IOException {
        final int sourceType = image.getType();
        System.out.println("ThumbnailatorCompressor#type: " + sourceType);
        return Thumbnails.of(image)
                .imageType(sourceType)
                .scale(scale)
                .outputQuality(quality)
                .useOriginalFormat()
                .asBufferedImage();
    }
}
- horse.pdf
- horse.png
/**
 * Smoke test for {@link ReplaceHightResolutionImage}: runs a replacement on
 * the bundled sample PDF and checks that an output file is produced.
 */
public class ReplaceHightResolutionImageTest {

    // NOTE(review): the method name mentions ThumbnailatorCompressor but a SimpleCompress
    // is passed in — confirm which compressor this test is meant to exercise.
    @Test
    public void reduceWithThumbnailatorCompressor() throws IOException, DocumentException {
        double quality = 1d;
        double scale = 0.6d;

        File pdf = new File("pdf/asset/horse.pdf");
        // Give the result a .pdf extension so it can be opened directly.
        File output = new File("pdf/target/output", "replaced-" + quality + "-" + scale + ".pdf");
        // The target directory does not exist on a clean checkout.
        output.getParentFile().mkdirs();

        ReplaceHightResolutionImage replacer = new ReplaceHightResolutionImage(quality, scale, new SimpleCompress());
        replacer.replace(pdf, output);

        if (!output.isFile() || output.length() == 0) {
            throw new AssertionError("Expected a non-empty PDF at " + output);
        }
    }
}
我想通过用较低分辨率的图像替换高分辨率图像来减小pdf文件的大小。要解决此问题,我必须:从pdf提取图像(流)压缩图像...
这是一个可行但不够好的答案。它可以很好地压缩jpg和png。唯一的缺点是,如果您在多个页面中重复使用图像,它将把每个图像引用当作一个单独的流,并产生一个新的流来代替图像引用,这可能会导致更大的文件大小。
1 0 obj
<</Type/XObject/Subtype/Image/Width 1002/Height 564/Filter/DCTDecode/ColorSpace/DeviceRGB/BitsPerComponent 8/Length 89149>>stream
...
endstream
endobj
2 0 obj
<</Length 106/Filter/FlateDecode>>stream
x�m�=� ��w�^@|���=� 7�/����8�6��&b0$��
��N!o��L�,?Ck'�����c�h�x0��/(5c*�Y�سEX�o�Uj3�B�ݔ"
endstream
endobj
4 0 obj
<</Type/Page/MediaBox[0 0 595 842]/Resources<</XObject<</img0 1 0 R>>>>/Contents 2 0 R/Parent 3 0 R>>
endobj
5 0 obj
<</Length 106/Filter/FlateDecode>>stream
x�m�=� ��w�^@|���=�image 7�/����8�6��&b0$��
��N!o��L�,?Ck'�����c�h�x0��/(5c*�Y�سEX�o�Uj3�B�ݔ"
endstream
endobj
6 0 obj
<</Type/Page/MediaBox[0 0 595 842]/Resources<</XObject<</img0 1 0 R>>>>/Contents 5 0 R/Parent 3 0 R>>
endobj
package io.gitlab.donespeak.tutorial.pdf.reducesize;
import io.gitlab.donespeak.tutorial.pdf.reducesize.imagecompress.ThumbnailatorCompressor;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
/**
 * PDFBox-based helpers that extract the images of a PDF or replace them with
 * down-scaled, re-encoded copies.
 */
public class RemoveAllImageFromPdf {

    /** Quality used by {@link #compress(File, File)}. */
    private static final double DEFAULT_QUALITY = 0.8;
    /** Scale factor used by {@link #compress(File, File)}. */
    private static final double DEFAULT_SCALE = 0.5;

    /**
     * Writes every image XObject found in the page resources of {@code input}
     * to {@code imageDir}.
     *
     * @param input    PDF to read
     * @param imageDir target directory; an existing empty directory is recreated
     * @throws IOException if the PDF cannot be loaded or an image cannot be decoded or written
     */
    public static void extractImages(File input, File imageDir) throws IOException {
        if (imageDir.exists()) {
            imageDir.delete();
        }
        imageDir.mkdirs();
        // try-with-resources: the document is closed even when extraction fails midway.
        try (PDDocument document = PDDocument.load(input)) {
            int pageIndex = 1;
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            for (PDPage page : catalog.getPages()) {
                PDResources pdResources = page.getResources();
                // A page is not required to carry a resource dictionary.
                if (pdResources != null) {
                    System.out.println(pdResources.toString());
                    for (COSName c : pdResources.getXObjectNames()) {
                        System.out.println("PDResources[" + pageIndex + "]#COSName: " + c.getName());
                        PDXObject o = pdResources.getXObject(c);
                        // String concatenation tolerates a null XObject, unlike o.toString().
                        System.out.println("PDResources[" + pageIndex + "]#PDXObject: " + o);
                        // https://github.com/mkl-public/testarea-itext5/blob/master/src/test/java/mkl/testarea/itext5/extract/ImageExtraction.java
                        if (o instanceof PDImageXObject) {
                            PDImageXObject img = (PDImageXObject) o;
                            System.out.println(img.getSuffix() + "-" + img.getBitsPerComponent() + "-" + img.getColorSpace());
                            File file = new File(imageDir, pageIndex + "-" + c.getName() + "-" + img.getColorSpace() + "-" + System.nanoTime() + "." + img.getSuffix());
                            // ImageIO.write returns false when no writer handles the format/image type.
                            if (!ImageIO.write(img.getImage(), img.getSuffix(), file)) {
                                System.out.println("No ImageIO writer for " + file + "; image was not written");
                            }
                        }
                    }
                }
                pageIndex++;
            }
        }
    }

    /**
     * Compresses the images of {@code input} with quality {@code 0.8} and
     * scale {@code 0.5} and saves the result to {@code output}.
     *
     * @param input  PDF to read
     * @param output destination file; missing parent directories are created
     * @throws IOException if the PDF cannot be read, processed or saved
     */
    public static void compress(File input, File output) throws IOException {
        compress(input, output, DEFAULT_QUALITY, DEFAULT_SCALE);
    }

    /**
     * Replaces every image XObject in the page resources of {@code input} by a
     * compressed copy and saves the document to {@code output}.
     *
     * <p>An image referenced from several pages is compressed once and the same
     * replacement is reused for every reference, so shared images stay shared.
     *
     * @param input   PDF to read
     * @param output  destination file; missing parent directories are created
     * @param quality quality value forwarded to the compressor
     * @param scale   scale factor forwarded to the compressor
     * @throws IOException if the PDF cannot be read, processed or saved
     */
    public static void compress(File input, File output, double quality, double scale) throws IOException {
        // getAbsoluteFile() so that a bare file name still yields a parent directory.
        File outputDir = output.getAbsoluteFile().getParentFile();
        if (outputDir != null && !outputDir.exists()) {
            outputDir.mkdirs();
        }
        ThumbnailatorCompressor compressor = new ThumbnailatorCompressor();
        // Original image stream -> replacement, compared by identity. Each replacement is
        // also mapped to itself so that an already replaced entry (reached again through a
        // resource dictionary shared between pages) is never compressed a second time.
        // Fully qualified to leave the file's import block untouched.
        java.util.Map<Object, PDImageXObject> replacements = new java.util.IdentityHashMap<>();
        try (PDDocument document = PDDocument.load(input)) {
            int pageIndex = 1;
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            for (PDPage page : catalog.getPages()) {
                PDResources pdResources = page.getResources();
                // A page is not required to carry a resource dictionary.
                if (pdResources != null) {
                    for (COSName c : pdResources.getXObjectNames()) {
                        System.out.println("PDResources[" + pageIndex + "]#COSName: " + c.getName());
                        PDXObject o = pdResources.getXObject(c);
                        System.out.println("PDResources[" + pageIndex + "]#PDXObject: " + o);
                        // https://github.com/mkl-public/testarea-itext5/blob/master/src/test/java/mkl/testarea/itext5/extract/ImageExtraction.java
                        if (o instanceof PDImageXObject) {
                            PDImageXObject img = (PDImageXObject) o;
                            PDImageXObject imgNew = replacements.get(img.getCOSObject());
                            if (imgNew == null) {
                                imgNew = createCompressedImage(document, compressor, img, quality, scale);
                                replacements.put(img.getCOSObject(), imgNew);
                                replacements.put(imgNew.getCOSObject(), imgNew);
                            }
                            pdResources.put(c, imgNew);
                        }
                    }
                }
                pageIndex++;
            }
            document.save(output);
        }
    }

    /** Compresses one image and wraps the result in a new image XObject: lossless for PNG sources, JPEG otherwise. */
    private static PDImageXObject createCompressedImage(PDDocument document, ThumbnailatorCompressor compressor,
            PDImageXObject img, double quality, double scale) throws IOException {
        BufferedImage bufferedImage = compressor.compress(img.getImage(), img.getSuffix(), quality, scale);
        System.out.println("img(w, h): (" + img.getWidth() + "," + img.getHeight() + ")");
        System.out.println("bufferedImage(w, h): (" + bufferedImage.getWidth() + "," + bufferedImage.getHeight() + ")");
        if ("png".equalsIgnoreCase(img.getSuffix())) {
            return LosslessFactory.createFromImage(document, bufferedImage);
        }
        return JPEGFactory.createFromImage(document, bufferedImage);
    }
}
通过使用以下方法直接处理文档中的对象,也许我们可以解决上述问题。但是我不知道如何以这种方式替换流。
// iText 5: fetch the object stored at cross-reference index i of the document
new com.itextpdf.text.pdf.PdfReader(new FileInputStream(pdf)).getPdfObject(i);
// or
// PDFBox 2: presumably lists the objects of the low-level document — TODO confirm
org.apache.pdfbox.pdmodel.PDDocument.load(pdf).getDocument().getObjects()