我有一张包含一些URL的Excel表格,我正在尝试使用Fortigate的Web过滤器编写一个填充网站类别的宏。因此,例如,工作表上的单元格可能包含“www.google.com”,并且该网站会将其归类为URL https://fortiguard.com/webfilter?q=google.com中的“搜索引擎和门户网站”。
我正在努力解析HTML以获得类别。 HTML看起来像:
<DIV class=sidebar-content>
<H4>WF Rating History</H4>
<P><SPAN style="FONT-SIZE: 10px"><EM>Jun 10th, 2008 @ 17:45:24 PDT</EM></SPAN><BR>added as <STRONG>Search Engines and Portals</STRONG></P>
<DIV><A href="about://forticlient.com/" target=_blank><IMG src="about:/static/images/forticlient_share_button.png?v=5"></A> </DIV></DIV>
和我以前得到的代码在下面(我只是尝试使用一个网站,并假设它位于工作表上的A1):
Sub siteCatgories()
Dim xhr As MSXML2.XMLHTTP60
Dim doc As MSHTML.HTMLDocument
Set xhr = New MSXML2.XMLHTTP60
Url = Cells(1, 1).Value
With xhr
.Open "GET", "https://fortiguard.com/webfilter?q=" & Url, False
.send
If .readyState = 4 And .Status = 200 Then
Set doc = New MSHTML.HTMLDocument
doc.body.innerHTML = .responseText
End If
End With
'retrieve relevant HTML
Debug.Print doc.getElementsByClassName("sidebar-content").toString
End Sub
上面的debug语句只返回[Object]。任何有关如何在VBA中解析网站的HTML的帮助将不胜感激!
这种简单的解析可以使用Split()
而不是HTMLDocument
来完成:
Sub Test()
MsgBox Join(GetData("google.com"), vbCrLf)
End Sub
Function GetData(sUrl)
Dim tmp
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://fortiguard.com/webfilter?q=" & sUrl, False, "u051772", "fy17janr"
.Send
tmp = .ResponseText
End With
tmp = Split(tmp, "WF Rating History", 2)(1)
tmp = Split(tmp, "<em>", 2)(1)
tmp = Split(tmp, "</strong>", 2)(0)
tmp = Split(tmp, "</em>", 2)
tmp(1) = Split(tmp(1), "<strong>", 2)(1)
GetData = tmp
End Function
输出:
尝试一下这个怎么样:
Sub Fetch_Data()
Dim http As New ServerXMLHTTP60, HTML As New HTMLDocument
Dim post As Object
With http
.Open "GET", "https://fortiguard.com/webfilter?q=google.com", False
.send
HTML.body.innerHTML = .responseText
End With
For Each post In HTML.getElementsByClassName("sidebar-content")
With post.getElementsByTagName("em")
If .Length Then Row = Row + 1: Cells(Row, 1) = .Item(0).innerText
End With
With post.getElementsByTagName("strong")
If .Length Then Cells(Row, 2) = .Item(0).innerText
End With
Next post
End Sub
输出:
Jun 10th, 2008 @ 17:45:24 PDT
Search Engines and Portals
参考添加到库:
1.Microsoft Html Object Library
2.Microsoft XML, V6.0