但是这个代码在打开两个或三个URL后停止,我们看到以下错误消息, 1.运行时错误91 2.对象变量或未设置块
Sub test()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim n As Integer
Dim i As Integer
Dim HtmlToText As String
Dim result
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow 'Start the loop on the second row of column A. Until the last URL..
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = False
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Dim Name As Variant
Dim Posts As Variant
Dim Followers As Variant
Dim Following As Variant
Dim DivValue As Variant
Dim DivValueSplit As Variant
Dim DivValueResult As Variant
Dim Biography As Variant
Name = doc.getElementsByClassName("AC5d8 notranslate")(0).innerText
Posts = doc.getElementsByClassName("g47SY")(0).innerText
Followers = doc.getElementsByClassName("g47SY")(1).innerText
Following = doc.getElementsByClassName("g47SY")(2).innerText
'dd = web.document.querySelector("div.-vDIg span").innerText
DivValue = doc.getElementsByClassName("-vDIg")(0).innerText
'DivValueSplit = Split(DivValue, "<br>")
'If UBound(DivValueSplit) = 2 Then
' DivValueResult = DivValueSplit(1) & DivValueSplit(2)
' j = InStr(DivValueResult, "</span>")
' Biography = Mid(DivValueResult, 7, j - 7)
'ElseIf sURL = "https://www.instagram.com/philipplein/" Then
' DivValueResult = DivValueSplit(0)
'j = InStr(DivValueResult, "</h1>")
'Biography = Mid(DivValueResult, 19, j - 5)
'Else
' DivValueResult = DivValueSplit(1)
' j = InStr(DivValueResult, "</span>")
' Biography = Mid(DivValueResult, 7, j - 7)
'End If
Worksheets("sheet1").Cells(i, 2) = Name
Worksheets("sheet1").Cells(i, 3) = Followers
Worksheets("sheet1").Cells(i, 4) = Following
Worksheets("sheet1").Cells(i, 5) = Posts
Worksheets("sheet1").Cells(i, 6) = DivValue
'Biography = Replace(re1, "<span>", "")
'Cells(i, 2) = HtmlToText
' myarray = Split(Data, vbCrLf)
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Next i
End Sub
大纲:
两种方法。一个没有浏览器打开,发布XMLHTTP request,另一个使用Internet Explorer。
如果有一种API方法可以完成这项工作,我肯定会接受它。以下两种方法目前适用于您显示的所有网址。
注意:
这些是基于表格中URL的末尾部分,即人名。见底部图片。
这使用自定义类clsHTTP
来保存XMLHTTP object
。它有两种方法。一,GetString
,发出请求并解析响应的一部分。另一个,GetInfo
,取GetString
返回的字符串并解析出感兴趣的元素并将它们返回到数组中。
ALL:
该课程可以开发。这是骨头。特别是,它可以处理错误处理,例如,处理服务器连接丢失。
VBA:
类clsHTTP
:
Option Explicit
Private http As Object
Private Sub Class_Initialize()
Set http = CreateObject("MSXML2.XMLHTTP")
End Sub
Public Function GetString(ByVal URL As String) As String
Dim sResponse As String
With http
.Open "GET", URL, False
.send
sResponse = StrConv(.responseBody, vbUnicode)
GetString = Split(Split(sResponse, "ProfilePage"":")(1), "comments_disabled")(0)
End With
End Function
Public Function GetInfo(ByVal sResponse As String) As Variant
Dim results(0 To 4)
'Name, Followers, Following,Posts,Biography
On Error Resume Next
results(0) = Split(sResponse, """full_name"":""")(1)
results(1) = Split(Split(sResponse, """count"":")(1), "}")(0)
results(2) = Split(Split(sResponse, """count"":")(2), "}")(0)
results(3) = Split(Split(sResponse, """count"":")(4), ",")(0)
results(4) = Split(Split(sResponse, """biography"":""")(1), """,")(0)
On Error GoTo 0
GetInfo = results
End Function
标准模块module 1
:
Option Explicit
Public Sub GetInfo()
Dim http As clsHTTP, sResponse As String, lastRow As Long, groupResults()
Set http = New clsHTTP
Const BASE_URL As String = "https://www.instagram.com/"
With ThisWorkbook.Worksheets("Sheet1")
lastRow = .Cells(.Rows.Count, "A").End(xlUp).Row
Select Case lastRow
Case 1
Exit Sub
Case 2
ReDim arr(1, 1): arr(1, 1) = .Range("A2").Value
Case Else
arr = .Range("A2:A" & lastRow).Value
End Select
ReDim groupResults(0 To lastRow - 2)
Dim results(0 To 4), counter As Long, i As Long
With http
For i = LBound(arr, 1) To UBound(arr, 1)
If Len(BASE_URL & arr(i, 1)) > Len(BASE_URL) Then
sResponse = .GetString(BASE_URL & arr(i, 1))
groupResults(counter) = .GetInfo(sResponse)
sResponse = vbNullString
counter = counter + 1
End If
Next
End With
For i = LBound(groupResults) To UBound(groupResults)
.Cells(2 + i, "B").Resize(1, UBound(results) + 1) = groupResults(i)
Next
End With
End Sub
稍后我会写一些更好的东西,但是下面将循环置于你创建Internet Explorer对象的位置,这样你就不会继续创建和销毁。它引入了等待元素存在以及页面加载。
ALL:
我会做一些初步的改变:
VBA:
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, lastRow As Long, arr(), groupResults()
Const BASE_URL As String = "https://www.instagram.com/"
With ThisWorkbook.Worksheets("Sheet1")
lastRow = .Cells(.Rows.Count, "A").End(xlUp).Row
Select Case lastRow
Case 1
Exit Sub
Case 2
ReDim arr(1, 1): arr(1, 1) = .Range("A2").Value
Case Else
arr = .Range("A2:A" & lastRow).Value
End Select
ReDim groupResults(0 To lastRow - 2)
Dim results(0 To 4), counter As Long, i As Long
With IE
.Visible = True
For i = LBound(arr, 1) To UBound(arr, 1)
If Len(BASE_URL & arr(i, 1)) > Len(BASE_URL) Then
.navigate BASE_URL & arr(i, 1)
While .Busy Or .readyState < 4: DoEvents: Wend
'Name, Followers, Following,Posts,Biography
Dim aNodeList As Object, ele As Object, t As Date
Const MAX_WAIT_SEC As Long = 5
t = Timer
Do
DoEvents
On Error Resume Next
Set ele = .document.querySelector(".rhpdm")
On Error GoTo 0
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop While ele Is Nothing
' Application.Wait Now + TimeSerial(0, 0, 2)
results(0) = ele.innerText
Set aNodeList = .document.querySelectorAll(".g47SY")
results(1) = aNodeList.item(0).innerText
results(2) = aNodeList.item(1).innerText
results(3) = aNodeList.item(2).innerText
results(4) = .document.querySelector(".rhpdm ~ span").innerText
Set aNodeList = Nothing : Set ele = Nothing
groupResults(counter) = results
counter = counter + 1
End If
Next
.Quit '<== Remember to quit application
End With
For i = LBound(groupResults) To UBound(groupResults)
.Cells(2 + i, "B").Resize(1, UBound(results) + 1) = groupResults(i)
Next
End With
End Sub
结果: