如何使用 iText 将带有图像和超链接的 HTML 转换为 PDF?

问题描述 投票:0回答:2

我正在尝试在同时使用 MVC 和 Web 表单的情况下,将带有图像和超链接的 HTML 转换为 PDF。
pdf itext html-parsing html-agility-pack xmlworker
2个回答
12
投票
XMLWorker

仅理解绝对URI,因此所描述的问题是预期行为。如果没有一些附加信息,解析器无法自动推断出 URI 方案 或路径。 

实现 

ILinkProvider

修复了损坏的超链接问题,并实现 IImageProvider 修复了损坏的图像问题。由于两种实现都必须执行 URI 解析,这是第一步。下面的帮助器类就做到了这一点,并且还尝试使 Web (ASP.NET) 上下文调用(示例如下)尽可能简单:


/// <summary>
/// Resolves URIs on behalf of LinkProvider and ImageProvider.
/// </summary>
/// <remarks>
/// When running in a web context (HttpContext.Current is not null) the
/// base URI passed to the constructor is replaced:
///   [1] IsLocal == false (LinkProvider): absolute http[s] URI made of the
///       request authority plus the application path; see CreateBase().
///   [2] IsLocal == true (ImageProvider): the application path, an absolute
///       virtual path starting with '/'; see Combine().
/// </remarks>
public class UriHelper
{
    /// <summary>True when URIs should be resolved to local paths (images).</summary>
    public bool IsLocal { get; set; }

    /// <summary>HttpContext.Current captured at construction; null outside a web request.</summary>
    public HttpContext HttpContext { get; private set; }

    /// <summary>Base URI every relative URI is resolved against.</summary>
    public Uri BaseUri { get; private set; }

    /// <summary>Creates a helper with IsLocal set to true.</summary>
    public UriHelper(string baseUri) : this(baseUri, true) { }

    /// <summary>Creates a helper and computes <see cref="BaseUri"/>.</summary>
    /// <exception cref="InvalidOperationException">
    /// No valid base URI can be created.
    /// </exception>
    public UriHelper(string baseUri, bool isLocal)
    {
        IsLocal = isLocal;
        HttpContext = HttpContext.Current;
        BaseUri = CreateBase(baseUri);
    }

    /// <summary>
    /// Builds the URI/path IImageProvider uses to instantiate an
    /// iTextSharp.text.Image for each img element in the HTML.
    /// </summary>
    /// <param name="relativeUri">The relative URI taken from the HTML.</param>
    /// <returns>A local path or an absolute URI string.</returns>
    public string Combine(string relativeUri)
    {
        // In a web context the HTML comes from a MVC view or web form, so
        // the incoming URI is converted to a **local** file system path.
        bool mapToLocalPath = HttpContext != null && !BaseUri.IsAbsoluteUri && IsLocal;
        if (mapToLocalPath)
        {
            // original author's note: Combine() checks directory traversal exploits
            string virtualPath = VirtualPathUtility.Combine(BaseUri.ToString(), relativeUri);
            return HttpContext.Server.MapPath(virtualPath);
        }

        if (BaseUri.Scheme == Uri.UriSchemeFile)
        {
            return Path.Combine(BaseUri.LocalPath, relativeUri);
        }

        // for this example we're assuming Uri.Scheme is http[s]
        return new Uri(BaseUri, relativeUri).AbsoluteUri;
    }

    /// <summary>Creates the base URI, overriding the supplied value on a web server.</summary>
    private Uri CreateBase(string baseUri)
    {
        if (HttpContext != null)
        {
            // running on a web server; the original value is replaced
            var request = HttpContext.Request;
            if (IsLocal)
            {
                // IImageProvider: absolute virtual path (starts with '/'),
                // later converted to a file system path by Combine()
                baseUri = request.ApplicationPath;
            }
            else
            {
                // ILinkProvider: absolute http[s] URI
                baseUri = request.Url.GetLeftPart(UriPartial.Authority)
                    + HttpContext.Request.ApplicationPath;
            }
        }

        Uri created;
        if (!Uri.TryCreate(baseUri, UriKind.RelativeOrAbsolute, out created))
        {
            throw new InvalidOperationException("cannot create a valid BaseUri");
        }
        return created;
    }
}

实现 ILinkProvider 非常简单,因为 UriHelper 已经给出了基本 URI;我们只需要正确的 URI 方案(file 或 http[s]):

/// <summary>
/// Makes hyperlinks with relative URLs absolute by supplying a link root.
/// </summary>
public class LinkProvider : ILinkProvider
{
    /// <summary>Path separator; rfc1738 - file URI scheme section 3.10.</summary>
    public const char SEPARATOR = '/';

    /// <summary>The link root returned by <see cref="GetLinkRoot"/>.</summary>
    public string BaseUrl { get; private set; }

    /// <summary>
    /// Computes the link root from <paramref name="uriHelper"/>'s BaseUri.
    /// Simplified implementation that only takes into account
    /// Uri.UriSchemeFile, Uri.UriSchemeHttp and Uri.UriSchemeHttps.
    /// </summary>
    /// <exception cref="ArgumentNullException">uriHelper is null.</exception>
    public LinkProvider(UriHelper uriHelper)
    {
        if (uriHelper == null)
        {
            throw new ArgumentNullException("uriHelper");
        }

        var uri = uriHelper.BaseUri;
        if (uri.Scheme == Uri.UriSchemeFile)
        {
            // need exactly one trailing separator or file paths break
            BaseUrl = uri.AbsoluteUri.TrimEnd(SEPARATOR) + SEPARATOR;
        }
        else
        {
            // assumes Uri.UriSchemeHttp || Uri.UriSchemeHttps
            BaseUrl = uri.AbsoluteUri;
        }
    }

    /// <summary>Returns the link root computed in the constructor.</summary>
    public string GetLinkRoot()
    {
        return BaseUrl;
    }
}

IImageProvider 只需要实现单个方法 Retrieve(string src),不过 Store(string src, Image img) 实现起来也很简单——请注意该方法以及 GetImageRootPath() 中的内联注释:

/// <summary>
/// Handles img elements in the HTML: loads, scales and caches the
/// iTextSharp images requested by XMLWorker.
/// </summary>
public class ImageProvider : IImageProvider
{
    private readonly UriHelper _uriHelper;

    // filled by Store(string src, Image img)
    private readonly Dictionary<string, Image> _imageCache =
        new Dictionary<string, Image>();

    /// <summary>Percentage passed to Image.ScalePercent() for every image.</summary>
    public virtual float ScalePercent { get; set; }

    /// <summary>Pattern recognising base64 "data:image/..." URIs.</summary>
    public virtual Regex Base64 { get; set; }

    /// <summary>
    /// Uses a scale of 67 percent, hard-coded based on general past
    /// experience; call the overload to supply your own.
    /// </summary>
    public ImageProvider(UriHelper uriHelper) : this(uriHelper, 67f) { }

    /// <summary>Creates a provider with an explicit scale percentage.</summary>
    public ImageProvider(UriHelper uriHelper, float scalePercent)
    {
        _uriHelper = uriHelper;
        ScalePercent = scalePercent;
        Base64 = new Regex( // rfc2045, section 6.8 (alphabet/padding)
            @"^data:image/[^;]+;base64,(?<data>[a-z0-9+/]+={0,2})$",
            RegexOptions.Compiled | RegexOptions.IgnoreCase
        );
    }

    /// <summary>Applies <see cref="ScalePercent"/> to the image and returns it.</summary>
    public virtual Image ScaleImage(Image img)
    {
        img.ScalePercent(ScalePercent);
        return img;
    }

    /// <summary>
    /// Returns the image for <paramref name="src"/>: from the cache, from a
    /// http[s] URL, from a base64 data URI, or from the path produced by
    /// UriHelper.Combine().
    /// </summary>
    /// <returns>The scaled image, or null when it cannot be loaded.</returns>
    public virtual Image Retrieve(string src)
    {
        // single lookup instead of ContainsKey() followed by the indexer
        Image cached;
        if (_imageCache.TryGetValue(src, out cached))
        {
            return cached;
        }

        try
        {
            if (Regex.IsMatch(src, "^https?://", RegexOptions.IgnoreCase))
            {
                return ScaleImage(Image.GetInstance(src));
            }

            Match match = Base64.Match(src);
            if (match.Success)
            {
                return ScaleImage(Image.GetInstance(
                    Convert.FromBase64String(match.Groups["data"].Value)
                ));
            }

            var imgPath = _uriHelper.Combine(src);
            return ScaleImage(Image.GetInstance(imgPath));
        }
        // Error handling is not implemented, to keep the SO answer
        // (relatively) short: every failure (BadElementException,
        // IOException, anything else) yields null.
        catch (Exception)
        {
            return null;
        }
    }

    /// <summary>
    /// Caches an image under its src. Per the original author's notes this
    /// is always called after Retrieve(string src):
    /// [1] duplicate img elements are cached so the image bytes are written
    ///     to the PDF only **once**, which reduces the resulting file size.
    /// [2] the cache can also **potentially** save network IO when the
    ///     parser runs in a loop, since Image.GetInstance() creates a
    ///     WebRequest for remote images (the author couldn't find a
    ///     CachePolicy in the source code).
    /// </summary>
    public virtual void Store(string src, Image img)
    {
        if (!_imageCache.ContainsKey(src))
        {
            _imageCache.Add(src, img);
        }
    }

    /* XMLWorker documentation for ImageProvider recommends implementing
     * GetImageRootPath():
     *
     * http://demo.itextsupport.com/xmlworker/itextdoc/flatsite.html#itextdoc-menu-10
     *
     * but a quick run through the debugger never hits the breakpoint, so
     * not sure if I'm missing something, or something has changed internally
     * with XMLWorker....
     */
    public virtual string GetImageRootPath()
    {
        return null;
    }

    /// <summary>Intentionally does nothing.</summary>
    public virtual void Reset() { }
}

基于 XML Worker 文档,将上面 ILinkProvider 和 IImageProvider 的实现挂接到一个简单的解析器类中非常简单:

/// <summary>
/// A simple parser that uses XMLWorker and XMLParser to handle converting
/// (most) images and hyperlinks internally.
/// </summary>
public class SimpleParser
{
    public virtual ILinkProvider LinkProvider { get; set; }
    public virtual IImageProvider ImageProvider { get; set; }
    public virtual HtmlPipelineContext HtmlPipelineContext { get; set; }
    public virtual ITagProcessorFactory TagProcessorFactory { get; set; }
    public virtual ICSSResolver CssResolver { get; set; }

    /* overloads simplified to keep SO answer (relatively) short. if needed
     * set LinkProvider/ImageProvider after instantiating SimpleParser()
     * to override the defaults (e.g. ImageProvider.ScalePercent)
     */
    public SimpleParser() : this(null) { }

    /// <summary>Wires up the default providers for <paramref name="baseUri"/>.</summary>
    public SimpleParser(string baseUri)
    {
        LinkProvider = new LinkProvider(new UriHelper(baseUri, false));
        ImageProvider = new ImageProvider(new UriHelper(baseUri, true));

        HtmlPipelineContext = new HtmlPipelineContext(null);

        // another story altogether, and not implemented for simplicity
        TagProcessorFactory = Tags.GetHtmlTagProcessorFactory();
        CssResolver = XMLWorkerHelper.GetInstance().GetDefaultCssResolver(true);
    }

    /// <summary>
    /// Self-closes img tags. When sending XHR via any of the popular
    /// JavaScript frameworks, img tags are **NOT** always closed, which
    /// results in the infamous
    /// iTextSharp.tool.xml.exceptions.RuntimeWorkerException:
    /// 'Invalid nested tag a found, expected closing tag img.'
    /// This is a simple workaround.
    /// </summary>
    /// <param name="xHtml">The markup to fix.</param>
    /// <returns>The markup with every unclosed img tag ending in " />".</returns>
    public virtual string SimpleAjaxImgFix(string xHtml)
    {
        // ${image} re-inserts the captured tag text; the lookbehind skips
        // tags that already end with "/>"
        return Regex.Replace(
            xHtml,
            "(?<image><img[^>]+)(?<=[^/])>",
            "${image} />",
            RegexOptions.IgnoreCase | RegexOptions.Multiline
        );
    }

    /// <summary>
    /// Converts <paramref name="xHtml"/> to PDF and writes it to
    /// <paramref name="stream"/>.
    /// </summary>
    public virtual void Parse(Stream stream, string xHtml)
    {
        xHtml = SimpleAjaxImgFix(xHtml);

        // Honour a caller-supplied TagProcessorFactory; the original always
        // used the default factory here, silently ignoring the property.
        var tagFactory = TagProcessorFactory ?? Tags.GetHtmlTagProcessorFactory();

        using (var stringReader = new StringReader(xHtml))
        {
            using (Document document = new Document())
            {
                PdfWriter writer = PdfWriter.GetInstance(document, stream);
                document.Open();

                HtmlPipelineContext
                    .SetTagFactory(tagFactory)
                    .SetLinkProvider(LinkProvider)
                    .SetImageProvider(ImageProvider)
                ;

                // pipelines are chained from the last stage to the first
                var pdfWriterPipeline = new PdfWriterPipeline(document, writer);
                var htmlPipeline = new HtmlPipeline(HtmlPipelineContext, pdfWriterPipeline);
                var cssResolverPipeline = new CssResolverPipeline(CssResolver, htmlPipeline);

                XMLWorker worker = new XMLWorker(cssResolverPipeline, true);
                XMLParser parser = new XMLParser(worker);
                parser.Parse(stringReader);
            }
        }
    }
}

正如内联注释所说,SimpleAjaxImgFix(string xHtml) 专门处理 XHR:XHR 可能会发送未关闭的 <img> 标签,这是有效的 HTML,但却是无效的 XML,会导致 XMLWorker 解析失败。有关如何使用 XHR 和 iTextSharp 接收 PDF 或其他二进制数据的简单说明和实现可以在此处找到。

SimpleAjaxImgFix(string xHtml) 中使用了 Regex,这样任何使用(复制/粘贴?)这段代码的人都不需要添加另一个 nuget 包;但实际上应该使用像 HtmlAgilityPack 这样的 HTML 解析器,因为它可以把这个: <div><img src='a.gif'><br><hr></div>

进入这个:

<div><img src='a.gif' /><br /><hr /></div>

只需几行代码:

// Load the markup with HtmlAgilityPack, then write it back out with the
// two options below enabled.
var hDocument = new HtmlDocument();
hDocument.OptionWriteEmptyNodes = true;
hDocument.OptionAutoCloseOnEnd = true;

hDocument.LoadHtml("<div><img src='a.gif'><br><hr></div>");

// the re-serialized markup
var closedTags = hDocument.DocumentNode.WriteTo();

还值得注意:可以把上面的 SimpleParser.Parse() 作为通用蓝图,进一步实现自定义的 ICSSResolver 或 ITagProcessorFactory,这在文档中有解释。现在,问题中描述的问题应该已经解决了。来自 MVC Action Method 的调用:

/// <summary>
/// Converts the posted HTML to PDF and writes it to the response as a
/// download named test.pdf.
/// </summary>
/// <param name="xHtml">The HTML markup to convert.</param>
[HttpPost]  // some browsers have URL length limits
[ValidateInput(false)] // or throws HttpRequestValidationException
public ActionResult Index(string xHtml)
{
    const string disposition = "attachment; filename=test.pdf";

    Response.ContentType = "application/pdf";
    Response.AppendHeader("Content-Disposition", disposition);

    // the PDF goes straight to the response stream, so no view is rendered
    new SimpleParser().Parse(Response.OutputStream, xHtml);
    return new EmptyResult();
}

或者在 Web Form 中从服务器控件获取 HTML:
// Render a server control to HTML, convert that HTML to PDF and send the
// result as a download named test.pdf.
Response.ContentType = "application/pdf";
Response.AppendHeader("Content-Disposition", "attachment; filename=test.pdf");

using (var markupBuffer = new StringWriter())
{
    // the HTML writer is closed before the buffer is read
    using (var controlWriter = new HtmlTextWriter(markupBuffer))
    {
        ConvertControlToPdf.RenderControl(controlWriter);
    }

    string controlHtml = markupBuffer.ToString();
    new SimpleParser().Parse(Response.OutputStream, controlHtml);
}

Response.End();

或者文件系统上带有超链接和图像的简单 HTML 文件:

<h1>HTML Page 00 on Local File System</h1> <div> <div> Relative &lt;img&gt;: <img src='Images/alt-gravatar.png' /> </div> <div> Hyperlink to file system HTML page: <a href='file-system-html-01.html'>Page 01</a> </div> </div>

或来自远程网站的 HTML:

<div> <div> <img width="200" alt="Wikipedia Logo" src="portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png"> </div> <div lang="en"> <a href="https://en.wikipedia.org/">English</a> </div> <div lang="en"> <a href="wiki/IText">iText</a> </div> </div>

以上两个
HTML
片段从控制台应用程序运行:

// Both HTML samples are read from files under basePath; each one is
// converted into its own PDF in the same folder.
string localHtml = File.ReadAllText(Path.Combine(basePath, "file-system-html-00.html"));
string wikipediaHtml = File.ReadAllText(Path.Combine(basePath, "wikipedia.html"));

string localPdfPath = Path.Combine(basePath, "filePaths.pdf");
string wikipediaPdfPath = Path.Combine(basePath, "remoteUrl.pdf");

// sample whose base URI is the local folder
using (var pdfStream = new FileStream(localPdfPath, FileMode.Create))
{
    new SimpleParser(basePath).Parse(pdfStream, localHtml);
}

// sample saved from a remote site: the site URL is the base URI
using (var pdfStream = new FileStream(wikipediaPdfPath, FileMode.Create))
{
    new SimpleParser("https://wikipedia.org").Parse(pdfStream, wikipediaHtml);
}

这是一个相当长的答案,但是看看这里同时标记为 html、pdf 和 itextsharp 的问题:截至撰写本文时(2016年2月23日),共有 776 个结果,而标记为 itextsharp 的问题总数为 4,063 个——占比约 19%。


0
投票

我的问题是将报告 html 中的图像渲染为 pdf。有了你的帖子我就能做到。

我正在使用 ASP.NET MVC 5。

我只需把 ImageProvider 类的这个方法(下面第一段)改成下面第二段的写法:

// Original version: returns null, i.e. no image root path.
public virtual string GetImageRootPath() { return null; }

// Replacement version: returns the result of mapping the "~/Content/Images/"
// virtual folder with HostingEnvironment.MapPath().
public virtual string GetImageRootPath() { return HostingEnvironment.MapPath("~/Content/Images/"); }

谢谢!

最新问题
© www.soinside.com 2019 - 2025. All rights reserved.