使用以下代码,我可以从互联网上下载文件的HTML:
WebClient wc = new WebClient();
// ....
string downloadedFile = wc.DownloadString("http://www.myurl.com/");
但是,有时文件包含“有趣”的字符,如é
到é
,←
到â†
和フシギダネ
到フシギダãƒ
。
我认为它可能与不同的unicode类型或某事有关,因为每个角色变成2个新角色,也许每个角色被分成两半,但我对这个领域知之甚少。你觉得怎么了?
这是一个包装下载类,它支持gzip并检查编码头和元标记,以便正确解码。
实例化该类,并调用GetPage()
。
public class HttpDownloader
{
private readonly string _referer;
private readonly string _userAgent;
public Encoding Encoding { get; set; }
public WebHeaderCollection Headers { get; set; }
public Uri Url { get; set; }
public HttpDownloader(string url, string referer, string userAgent)
{
Encoding = Encoding.GetEncoding("ISO-8859-1");
Url = new Uri(url); // verify the uri
_userAgent = userAgent;
_referer = referer;
}
public string GetPage()
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
if (!string.IsNullOrEmpty(_referer))
request.Referer = _referer;
if (!string.IsNullOrEmpty(_userAgent))
request.UserAgent = _userAgent;
request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
{
Headers = response.Headers;
Url = response.ResponseUri;
return ProcessContent(response);
}
}
private string ProcessContent(HttpWebResponse response)
{
SetEncodingFromHeader(response);
Stream s = response.GetResponseStream();
if (response.ContentEncoding.ToLower().Contains("gzip"))
s = new GZipStream(s, CompressionMode.Decompress);
else if (response.ContentEncoding.ToLower().Contains("deflate"))
s = new DeflateStream(s, CompressionMode.Decompress);
MemoryStream memStream = new MemoryStream();
int bytesRead;
byte[] buffer = new byte[0x1000];
for (bytesRead = s.Read(buffer, 0, buffer.Length); bytesRead > 0; bytesRead = s.Read(buffer, 0, buffer.Length))
{
memStream.Write(buffer, 0, bytesRead);
}
s.Close();
string html;
memStream.Position = 0;
using (StreamReader r = new StreamReader(memStream, Encoding))
{
html = r.ReadToEnd().Trim();
html = CheckMetaCharSetAndReEncode(memStream, html);
}
return html;
}
private void SetEncodingFromHeader(HttpWebResponse response)
{
string charset = null;
if (string.IsNullOrEmpty(response.CharacterSet))
{
Match m = Regex.Match(response.ContentType, @";\s*charset\s*=\s*(?<charset>.*)", RegexOptions.IgnoreCase);
if (m.Success)
{
charset = m.Groups["charset"].Value.Trim(new[] { '\'', '"' });
}
}
else
{
charset = response.CharacterSet;
}
if (!string.IsNullOrEmpty(charset))
{
try
{
Encoding = Encoding.GetEncoding(charset);
}
catch (ArgumentException)
{
}
}
}
private string CheckMetaCharSetAndReEncode(Stream memStream, string html)
{
Match m = new Regex(@"<meta\s+.*?charset\s*=\s*""?(?<charset>[A-Za-z0-9_-]+)""?", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
if (m.Success)
{
string charset = m.Groups["charset"].Value.ToLower() ?? "iso-8859-1";
if ((charset == "unicode") || (charset == "utf-16"))
{
charset = "utf-8";
}
try
{
Encoding metaEncoding = Encoding.GetEncoding(charset);
if (Encoding != metaEncoding)
{
memStream.Position = 0L;
StreamReader recodeReader = new StreamReader(memStream, metaEncoding);
html = recodeReader.ReadToEnd().Trim();
recodeReader.Close();
}
}
catch (ArgumentException)
{
}
}
return html;
}
}
由于我不被评论(声誉不足),我将不得不发布一个额外的答案。我经常使用Mikael的优秀课程,但我遇到了试图找到字符集元信息的正则表达式的实际问题。这个
Match m = new Regex(@"<meta\s+.*?charset\s*=\s*(?<charset>[A-Za-z0-9_-]+)", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
失败了
<meta charset="UTF-8"/>
而这个
Match m = new Regex(@"<meta\s+.*?charset\s*=\s*""?(?<charset>[A-Za-z0-9_-]+)""?", RegexOptions.Singleline | RegexOptions.IgnoreCase).Match(html);
才不是。
谢谢,Mikael。
试试这个
string downloadedFile = wc.DownloadString("http://www.myurl.com");
我总是删除最后一个“斜线”,它一直像现在一样有魅力。但我也可能是一种危险