我有一些代码(感谢另一个用户)打开IE进入一个网站并抓取数据并将其反馈到我的工作表。代码完全按照它应该如何工作,但我有一个小问题。
代码如下
'VBE > Tools > References:'1: Microsoft HTML Object library 2: Microsoft Internet Controls
Public Sub GetSoccerStats()
Dim ie As Object, t As Date
Dim objDoc As New MSHTML.HTMLDocument, text As String
Dim lastRow As Long, dataSheet As Worksheet, inputArray(), i As Long
Const MAX_WAIT_SEC As Long = 10
Set dataSheet = ThisWorkbook.Worksheets("AVG GOAL DATA")
Set ie = CreateObject("InternetExplorer.Application")
With dataSheet
lastRow = .Cells(.Rows.Count, "B").End(xlUp).Row
End With
inputArray = dataSheet.Range("C4:E" & lastRow).Value
inputArray = GetLinks(inputArray)
Dim results(), r As Long, c As Long
ReDim results(1 To UBound(inputArray, 1), 1 To 8)
With ie
.Visible = True
For i = LBound(inputArray, 1) To UBound(inputArray, 1)
r = r + 1
.navigate2 inputArray(i, 4)
While .Busy Or .readyState < 4: DoEvents: Wend
Dim objTable As MSHTML.HTMLTable, objTableRow As MSHTML.HTMLTableRow
Do
DoEvents
On Error Resume Next
Set objTable = .document.getElementsByClassName("table-main leaguestats")(0)
On Error GoTo 0
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While objTable Is Nothing
If Not objTable Is Nothing Then
c = 1
For Each objTableRow In objTable.Rows
text = objTableRow.Cells(0).innerText
Select Case text
Case "Matches played", "Matches remaining", "Home goals", "Away goals"
results(r, c) = objTableRow.Cells(1).innerText
results(r, c + 1) = objTableRow.Cells(2).innerText
c = c + 2
End Select
Next objTableRow
End If
Next
.Quit
End With
dataSheet.Range("F4").Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub
Public Function GetLinks(ByRef inputArray As Variant) As Variant
Dim i As Long
ReDim Preserve inputArray(1 To UBound(inputArray, 1), 1 To UBound(inputArray, 2) + 1)
For i = LBound(inputArray, 1) To UBound(inputArray, 1)
inputArray(i, 4) = IIf(inputArray(i, 1) = "CURRENT", inputArray(i, 2), inputArray(i, 3))
Next
GetLinks = inputArray
End Function
所以,我有一个工作表,其中包含足球联赛列表(在行中),然后列保存数据。此代码从betexplorer.com获取数据并填充此表(每行包含对应于联盟的数据)
目前,此代码在我的工作表上查看,如果在C列中,单词是CURRENT,则使用D列中提供的URL。如果C列包含单词LAST,则从E列获取URL。
问题是本赛季结束时的一些联赛被分成了几组(见https://www.betexplorer.com/soccer/belgium/jupiler-league/stats/)当你注意到你登陆这个页面时它会默认你给'冠军组'提供统计数据然而左边有一个标签那个说'主'。这是我在主选项卡上需要的数据。进一步的问题是并非所有联赛都有这个。据我所知,“主要”统计信息选项卡在HREF标记内有不同的URL,因此VBA可能会使用我在工作表上提供的链接,检查是否存在“主要”选项卡,如果不存在然后只需从该页面中提取数据,如果是,则重定向到“主”选项卡的URL并从该页面中提取数据...
HTML看起来像这样:
<li class="list-tabs__item"><a href="?stage=z3r4t5sS" class="list-tabs__item__in">Main</a></li>
<li class="list-tabs__item"><a href="?stage=hrVVyPkq" class="list-tabs__item__in current">Championship Group</a></li>
<li class="list-tabs__item"><a href="?stage=EPykCdW0" class="list-tabs__item__in">Europa League Group</a></li>
有很多可能的方法来解决这个问题。您正在测试一致的页面之间的差异。例如,锦标赛发生的页面有两排标签。上面有stats选项卡行,下面是Championship选项卡行。 id(如果在整个锦标赛标签页中保持一致)sm-0-0
仅出现在锦标赛标签页中而不是普通页面中。此外,你可能只找到有list-tabs--secondary
课程,其中有第二排冠军
以下是一些可能性:
如果href
具有一致的字符串,唯一标识冠军选项卡,您可以测试它的存在
If ie.document.querySelectorAll("[href*='stage=hrVVyPkq']").length > 0 Then
'Championship tab is present
Else
'Championship tab is not present
End If
或者,您可以按类测试页面上的默认选项卡的长度。如果您将“冠军”选项卡设置为默认值(以及“统计信息”父级/同上选项卡),则获得的长度为2.如果长度为1,则这是正常的。
If ie.document.querySelectorAll(".list-tabs__item__in.current").length > 1 Then
'championship tab present
Else
'Championship tab is not present
End If
测试id(如果在锦标赛页面中始终如一)
If ie.document.querySelectorAll("#sm-0-0").length > 0 Then
'championship tab present
'switch to main
ie.document.querySelector("#sm-0-0 a").click 'or >
ie.document.querySelector(".list-tabs--secondary a").click
Else
'Championship tab is not present
End If
测试辅助选项卡行类
If ie.document.querySelectorAll(".list-tabs--secondary").length > 0 Then
'championship tab present
'switch to main
ie.document.querySelector("#sm-0-0 a").click 'or >
ie.document.querySelector(".list-tabs--secondary a").click
Else
'Championship tab is not present
End If
示例实现:
Option Explicit
'VBE > Tools > References:
' Microsoft Internet Controls
Public Sub GetImageLinks()
Const MAX_WAIT_SEC As Long = 10
Dim ie As New InternetExplorer, i As Long, links(), objTable As MSHTML.HTMLTable, t As Date
links = Array("https://www.betexplorer.com/soccer/austria/tipico-bundesliga/stats/", "https://www.betexplorer.com/soccer/belgium/jupiler-league/stats/")
'the first above has championship tab
With ie
.Visible = True
For i = LBound(links) To UBound(links)
.Navigate2 links(i)
While .Busy Or .readyState < 4: DoEvents: Wend
If .document.querySelectorAll(".list-tabs--secondary").Length > 0 Then
'championship tab present
'switch to main
.document.querySelector(".list-tabs--secondary a").Click
While .Busy Or .readyState < 4: DoEvents: Wend
Else 'you don't need this part
'Championship tab is not present
End If
t = Timer
Do
DoEvents
On Error Resume Next
Set objTable = .document.getElementsByClassName("table-main leaguestats")(0)
On Error GoTo 0
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While objTable Is Nothing
Debug.Print objTable.NamespaceURI
Set objTable = Nothing
Next
Stop
.Quit
End With
End Sub