我在从 PDF 中提取图像时遇到问题。我尝试使用 GhostScript,但提取出的图像在对象边缘出现了像素化的锯齿。我尝试过的任何方法都没有改善这一点。当增加 DPI 时,提取的图像又太大。提取过程中似乎有某个环节降低了图像质量。
我将不胜感激任何人推荐一个软件包来执行此类操作 - 无论是付费还是免费。
有不少通用 PDF 库可以从 PDF 中提取图像,但并非所有库都提供了简便的方法。
作为 Docotic.Pdf 库的开发者之一,我可以推荐它来完成这项任务。
以下示例展示了如何从 PDF 中提取所有图像:
/// <summary>
/// Opens the PDF document at <c>path</c> and calls <c>Save</c> on every image
/// of every page, using a base name of the form <c>page{pageIndex}-image{imageIndex}</c>
/// (both indices are zero-based).
/// </summary>
static void ExtractImagesFromPdfPages()
{
    // Placeholder: point this at the PDF file to read.
    string path = "";

    using (PdfDocument pdf = new PdfDocument(path))
    {
        for (int pageIndex = 0; pageIndex < pdf.Pages.Count; pageIndex++)
        {
            var pageImages = pdf.Pages[pageIndex].Images;

            for (int imageIndex = 0; imageIndex < pageImages.Count; imageIndex++)
            {
                string baseName = $"page{pageIndex}-image{imageIndex}";

                // Save returns a string; presumably the path actually written
                // (with an extension appended) — verify against the library docs.
                string savedPath = pageImages[imageIndex].Save(baseName);
            }
        }
    }
}
该库不会对图像重新采样。它将以与 PDF 中完全相同的方式保存它们。
这是一个 .NET Framework C# 控制台应用程序,用于从 PDF 文档中提取并显示所有图像数据。
使用 itext7 v8.0.0 和 photo.exif v1.1.16 nuget 包。
图像对象的检索与 PDF 中的完全相同,无需修改、压缩等。
using iText.Commons.Utils;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Xobject;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
/// <summary>
/// Console sample that prints file-system and document metadata for a PDF,
/// then lists every image stream found in the document and saves each one
/// to its own file.
/// </summary>
internal class Program
{
    /// <summary>
    /// Entry point: dumps metadata for the hard-coded PDF path, extracts its
    /// images, saves them into the output directory and waits for Enter.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var filePath = "C:\\Path\\To\\Pdf\\Document.pdf";
        var outputDirectory = "C:\\Path\\To\\Save\\To";
        var fileInfo = new FileInfo(filePath);
        WriteHeading($"Extracting Data for: {fileInfo.Name}");
        using (var pdfReader = new PdfReader(fileInfo))
        using (var pdfDocument = new PdfDocument(pdfReader))
        {
            var documentInfo = pdfDocument.GetDocumentInfo();
            WriteLines(
                JsonUtil.SerializeToString(new
                {
                    fileInfo.Name,
                    fileInfo.FullName,
                    fileInfo.LastWriteTime,
                    fileInfo.LastWriteTimeUtc,
                    Attributes = fileInfo.Attributes.ToString(),
                    fileInfo.CreationTime,
                    fileInfo.CreationTimeUtc,
                    fileInfo.Length,
                    Author = documentInfo.GetAuthor(),
                    Creator = documentInfo.GetCreator(),
                    Keywords = documentInfo.GetKeywords(),
                    Producer = documentInfo.GetProducer(),
                    Subject = documentInfo.GetSubject(),
                    Title = documentInfo.GetTitle(),
                    NumberOfPages = pdfDocument.GetNumberOfPages(),
                    NumberOfPdfObjects = pdfDocument.GetNumberOfPdfObjects(),
                    PdfVersion = pdfDocument
                        .GetPdfVersion()
                        .ToPdfName()
                        .GetValue()
                }),
                ""
            );

            // List of Images found in the PDF file.
            var images = GetAndPrintImageInformation(pdfDocument);

            // Each image gets its own file; saving all of them to one fixed path
            // would overwrite the previous image on every iteration.
            Directory.CreateDirectory(outputDirectory);
            for (var index = 0; index < images.Count; index++)
            {
                using (var image = images[index])
                {
                    var fileName = $"image{index + 1}.{GetFileExtension(image)}";
                    // NOTE(review): Image.Save goes through a GDI+ encoder. If byte-exact
                    // copies of the embedded data are required, write the result of
                    // PdfImageXObject.GetImageBytes() to disk instead — confirm requirement.
                    image.Save(Path.Combine(outputDirectory, fileName));
                }
            }
        }
        Console.ReadLine();
    }

    /// <summary>
    /// Walks every indirect object of the document, decodes each image stream
    /// into an <see cref="Image"/>, prints its type, extension, dimensions,
    /// tag and EXIF data, and returns the decoded images.
    /// </summary>
    /// <param name="pdfDocument">The opened PDF document to scan.</param>
    /// <returns>
    /// The decoded images in object-number order. The caller owns them and
    /// should dispose them when done.
    /// </returns>
    static List<Image> GetAndPrintImageInformation(PdfDocument pdfDocument)
    {
        var images = new List<Image>();
        var objectCount = pdfDocument.GetNumberOfPdfObjects();
        for (var i = 1; i <= objectCount; i++)
        {
            // Null (free/missing object) and non-stream objects both yield null here.
            var pdfStream = pdfDocument.GetPdfObject(i) as PdfStream;
            if (pdfStream == null)
            {
                continue;
            }

            // GetAsName returns null when the key is absent or is not a name.
            var subType = pdfStream.GetAsName(PdfName.Subtype)?.GetValue();
            if (subType != PdfName.Image.GetValue()
                && subType != PdfName.ImageMask.GetValue()
                && subType != PdfName.StampImage.GetValue())
            {
                continue;
            }

            var imageObj = PdfXObject.MakeXObject(pdfStream) as PdfImageXObject;
            if (imageObj == null)
            {
                continue;
            }

            var imageBytes = imageObj.GetImageBytes();

            // The stream is intentionally NOT wrapped in a using block: GDI+ requires
            // the source stream to stay open for the lifetime of the Image, and the
            // returned images are saved by the caller after this method returns.
            var imageStream = new MemoryStream(imageBytes);
            Image image;
            try
            {
                image = Image.FromStream(imageStream);
            }
            catch (ArgumentException)
            {
                // Image.FromStream throws ArgumentException for data it cannot decode;
                // skip that object instead of aborting the whole extraction.
                imageStream.Dispose();
                WriteLines(
                    $"Skipping PDF object {i}: image data could not be decoded by System.Drawing.",
                    ""
                );
                continue;
            }
            images.Add(image);

            // Give the EXIF parser its own stream, positioned at the start, so it
            // neither depends on nor disturbs the stream that backs the Image.
            string exifDataJson;
            using (var exifStream = new MemoryStream(imageBytes))
            {
                var data = new photo.exif.Parser().Parse(exifStream);
                // Serialize while the stream is still open in case parsing is lazy.
                exifDataJson = JsonUtil.SerializeToString(data);
            }

            WriteLines(
                $"Image Type: {imageObj.IdentifyImageType()}",
                $"File Extension: {imageObj.IdentifyImageFileExtension()}",
                $"Dimensions: {image.Width}w X {image.Height}h",
                $"Tag: {image.Tag}",
                $"ExifData: {exifDataJson}",
                ""
            );
        }
        return images;
    }

    /// <summary>
    /// Picks a lower-case file extension (without the dot) matching the image's
    /// raw format, based on the installed GDI+ decoders.
    /// </summary>
    /// <param name="image">The decoded image.</param>
    /// <returns>
    /// The first extension of the matching decoder, or "png" when no decoder
    /// matches (Image.Save falls back to the PNG encoder in that case).
    /// </returns>
    static string GetFileExtension(Image image)
    {
        var codec = System.Drawing.Imaging.ImageCodecInfo.GetImageDecoders()
            .FirstOrDefault(c => c.FormatID == image.RawFormat.Guid);
        if (codec == null)
        {
            return "png";
        }

        // FilenameExtension looks like "*.JPG;*.JPEG;*.JPE;*.JFIF".
        return codec.FilenameExtension
            .Split(';')
            .First()
            .TrimStart('*', '.')
            .ToLowerInvariant();
    }

    /// <summary>
    /// Writes the heading followed by a line of '=' characters of the same length.
    /// </summary>
    /// <param name="heading">The heading text.</param>
    static void WriteHeading(string heading)
    {
        WriteLines(heading, new string('=', heading.Length));
    }

    /// <summary>
    /// Writes each argument to the console on its own line.
    /// </summary>
    /// <param name="lines">The values to print, in order.</param>
    static void WriteLines(params object[] lines)
    {
        foreach (var line in lines)
        {
            Console.WriteLine(line);
        }
    }
}
输出示例:
Extracting Data for: Document.pdf
=================================
{
"Name": "Document.pdf",
"FullName": "C:\\Path\\To\\Pdf\\Document.pdf",
"LastWriteTime": "2020-01-29T14:49:16.9118875+00:00",
"LastWriteTimeUtc": "2020-01-29T14:49:16.9118875Z",
"Attributes": "Archive",
"CreationTime": "2020-01-29T01:06:10.1240743+00:00",
"CreationTimeUtc": "2020-01-29T01:06:10.1240743Z",
"Length": 354600,
"Author": "James",
"Creator": "Microsoftr Word 2010",
"Producer": "Microsoftr Word 2010",
"NumberOfPages": 1,
"NumberOfPdfObjects": 59,
"PdfVersion": "1.5"
}
Image Type: JPEG
File Extension: jpg
Dimensions: 2012w X 622h
Tag:
ExifData: [
{
"Title": "ChrominanceTable",
"Description": "Chrominance table. The luminance table and the ...",
"Id": 20625,
"Length": 128,
"Value": 9
},
{
"Title": "LuminanceTable",
"Description": "Luminance table. The luminance table and the ...",
"Id": 20624,
"Length": 128,
"Value": 8
}
]
Image Type: JPEG
File Extension: jpg
Dimensions: 480w X 640h
Tag:
ExifData: [
{
"Title": "ChrominanceTable",
"Description": "Chrominance table. The luminance table and the ...",
"Id": 20625,
"Length": 128,
"Value": 9
},
{
"Title": "LuminanceTable",
"Description": "Luminance table. The luminance table and the ...",
"Id": 20624,
"Length": 128,
"Value": 8
}
]
Image Type: JPEG
File Extension: jpg
Dimensions: 370w X 165h
Tag:
ExifData: [
{
"Title": "LuminanceTable",
"Description": "Luminance table. The luminance table and the ...",
"Id": 20624,
"Length": 128,
"Value": 8
}
]
.NET Core 版本的 iTextSharp 不完整。我尝试了几个小时但没有成功。我使用免费软件包 PdfPig 取得了成功。
/// <summary>
/// Opens a PDF from memory and visits every image on every page, validating
/// each one by decoding it with System.Drawing and working out a suitable
/// file extension. Images that cannot be decoded are skipped.
/// </summary>
/// <param name="pdfBytes">The complete PDF file contents.</param>
public static void ExtractImagesWithPdfPig(byte[] pdfBytes)
{
    using var document = PdfDocument.Open(pdfBytes);
    foreach (var page in document.GetPages())
    {
        foreach (var pdfImage in page.GetImages())
        {
            var bytes = TryGetImage(pdfImage);
            using var mem = new MemoryStream(bytes);
            Image img;
            try
            {
                // Ensure data is valid
                img = Image.FromStream(mem);
            }
            catch (Exception)
            {
                // Deliberate best-effort: data System.Drawing cannot decode is skipped.
                continue;
            }

            // Dispose the decoded image at the end of each iteration; 'mem' is
            // declared earlier in the same scope and therefore outlives it.
            using (img)
            {
                // TODO : Do what you want with the image or bytes

                // Free code : get the ideal file extension
                // FirstOrDefault avoids an InvalidOperationException when no decoder
                // matches the raw format; 'extension' is null in that case.
                var codec = ImageCodecInfo.GetImageDecoders()
                    .FirstOrDefault(c => c.FormatID == img.RawFormat.Guid);
                // ToLowerInvariant keeps extensions such as "GIF"/"TIF" stable under
                // cultures with special casing rules (e.g. Turkish dotless i).
                var extension = codec?.FilenameExtension
                    .Split(';')
                    .First()
                    .TrimStart('*', '.')
                    .ToLowerInvariant();
            }
        }
    }

    // Returns the image as PNG bytes when possible, then the bytes from
    // TryGetBytes, and finally the raw stream bytes as a last resort.
    byte[] TryGetImage(IPdfImage image)
    {
        if (image.TryGetPng(out var bytes))
        {
            return bytes;
        }

        if (image.TryGetBytes(out var iroBytes))
        {
            return iroBytes.ToArray();
        }

        return image.RawBytes.ToArray();
    }
}