VBA将内部文本从html页面传输到excel

问题描述 投票:-1回答:1

此图显示了我使用以下宏的内容 enter image description here

但是这个代码在打开两个或三个URL后停止,我们看到以下错误消息, 1.运行时错误91 2.对象变量或未设置块

Sub test()

Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim n As Integer
Dim i As Integer
Dim HtmlToText As String
Dim result
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row

For i = 2 To lastrow  'Start the loop on the second row of column A. Until the last URL..

    Set wb = CreateObject("internetExplorer.Application")
    sURL = Cells(i, 1)
    wb.navigate sURL
    wb.Visible = False
    While wb.Busy
      DoEvents
    Wend
    'HTML document
    Set doc = wb.document
    Dim Name As Variant
    Dim Posts As Variant
    Dim Followers As Variant
    Dim Following As Variant
    Dim DivValue As Variant
    Dim DivValueSplit As Variant
    Dim DivValueResult As Variant
    Dim Biography As Variant

    Name = doc.getElementsByClassName("AC5d8 notranslate")(0).innerText
    Posts = doc.getElementsByClassName("g47SY")(0).innerText
    Followers = doc.getElementsByClassName("g47SY")(1).innerText
    Following = doc.getElementsByClassName("g47SY")(2).innerText
    'dd = web.document.querySelector("div.-vDIg span").innerText
    DivValue = doc.getElementsByClassName("-vDIg")(0).innerText

    'DivValueSplit = Split(DivValue, "<br>")
    'If UBound(DivValueSplit) = 2 Then
    '   DivValueResult = DivValueSplit(1) & DivValueSplit(2)
     '  j = InStr(DivValueResult, "</span>")
      ' Biography = Mid(DivValueResult, 7, j - 7)
    'ElseIf sURL = "https://www.instagram.com/philipplein/" Then
     ' DivValueResult = DivValueSplit(0)
      'j = InStr(DivValueResult, "</h1>")
      'Biography = Mid(DivValueResult, 19, j - 5)
    'Else
     '   DivValueResult = DivValueSplit(1)
      '  j = InStr(DivValueResult, "</span>")
       ' Biography = Mid(DivValueResult, 7, j - 7)
    'End If

    Worksheets("sheet1").Cells(i, 2) = Name
    Worksheets("sheet1").Cells(i, 3) = Followers
    Worksheets("sheet1").Cells(i, 4) = Following
    Worksheets("sheet1").Cells(i, 5) = Posts
    Worksheets("sheet1").Cells(i, 6) = DivValue
    'Biography = Replace(re1, "<span>", "")

    'Cells(i, 2) = HtmlToText
    ' myarray = Split(Data, vbCrLf)
err_clear:



      If Err <> 0 Then
          Err.Clear
          Resume Next
        End If
        wb.Quit
    Next i

End Sub
html excel vba excel-vba web-scraping
1个回答
2
投票

大纲:

两种方法。一个没有浏览器打开,发布XMLHTTP request,另一个使用Internet Explorer。

如果有一种API方法可以完成这项工作,我肯定会接受它。以下两种方法目前适用于您显示的所有网址。

注意:

这些是基于表格中URL的末尾部分,即人名。见底部图片。

XMLHttpRequest的:

这使用自定义类clsHTTP来保存XMLHTTP object。它有两种方法。一,GetString,发出请求并解析响应的一部分。另一个,GetInfo,取GetString返回的字符串并解析出感兴趣的元素并将它们返回到数组中。

ALL:

该课程可以开发。这是骨头。特别是,它可以处理错误处理,例如,处理服务器连接丢失。

VBA:

clsHTTP

Option Explicit

Private http As Object
Private Sub Class_Initialize()
    Set http = CreateObject("MSXML2.XMLHTTP")
End Sub

Public Function GetString(ByVal URL As String) As String
    Dim sResponse As String
    With http
        .Open "GET", URL, False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
        GetString = Split(Split(sResponse, "ProfilePage"":")(1), "comments_disabled")(0)   
    End With
End Function

Public Function GetInfo(ByVal sResponse As String) As Variant
    Dim results(0 To 4)
    'Name, Followers,  Following,Posts,Biography
    On Error Resume Next
    results(0) = Split(sResponse, """full_name"":""")(1)
    results(1) = Split(Split(sResponse, """count"":")(1), "}")(0)
    results(2) = Split(Split(sResponse, """count"":")(2), "}")(0)
    results(3) = Split(Split(sResponse, """count"":")(4), ",")(0)
    results(4) = Split(Split(sResponse, """biography"":""")(1), """,")(0)
    On Error GoTo 0
    GetInfo = results
End Function

标准模块module 1

Option Explicit
Public Sub GetInfo()
    Dim http As clsHTTP, sResponse As String, lastRow As Long, groupResults()
    Set http = New clsHTTP
    Const BASE_URL As String = "https://www.instagram.com/"

    With ThisWorkbook.Worksheets("Sheet1")
        lastRow = .Cells(.Rows.Count, "A").End(xlUp).Row
        Select Case lastRow
        Case 1
            Exit Sub
        Case 2
            ReDim arr(1, 1): arr(1, 1) = .Range("A2").Value
        Case Else
            arr = .Range("A2:A" & lastRow).Value
        End Select

        ReDim groupResults(0 To lastRow - 2)
        Dim results(0 To 4), counter As Long, i As Long
        With http
            For i = LBound(arr, 1) To UBound(arr, 1)
                If Len(BASE_URL & arr(i, 1)) > Len(BASE_URL) Then
                     sResponse = .GetString(BASE_URL & arr(i, 1))
                    groupResults(counter) = .GetInfo(sResponse)
                    sResponse = vbNullString
                    counter = counter + 1
                End If
            Next
        End With

        For i = LBound(groupResults) To UBound(groupResults)
            .Cells(2 + i, "B").Resize(1, UBound(results) + 1) = groupResults(i)
        Next
    End With
End Sub

IE浏览器:

稍后我会写一些更好的东西,但是下面将循环置于你创建Internet Explorer对象的位置,这样你就不会继续创建和销毁。它引入了等待元素存在以及页面加载。


ALL:

我会做一些初步的改变:

  1. 重构代码以具有处理数据提取的单独函数/ subs;
  2. 添加方法来管理失败的连接/超时。

VBA:

Option Explicit
Public Sub GetInfo()
    Dim IE As New InternetExplorer, lastRow As Long, arr(), groupResults()

    Const BASE_URL As String = "https://www.instagram.com/"

    With ThisWorkbook.Worksheets("Sheet1")
        lastRow = .Cells(.Rows.Count, "A").End(xlUp).Row
        Select Case lastRow
        Case 1
            Exit Sub
        Case 2
            ReDim arr(1, 1): arr(1, 1) = .Range("A2").Value
        Case Else
            arr = .Range("A2:A" & lastRow).Value
        End Select

        ReDim groupResults(0 To lastRow - 2)
        Dim results(0 To 4), counter As Long, i As Long
        With IE
            .Visible = True

            For i = LBound(arr, 1) To UBound(arr, 1)
                If Len(BASE_URL & arr(i, 1)) > Len(BASE_URL) Then
                    .navigate BASE_URL & arr(i, 1)

                    While .Busy Or .readyState < 4: DoEvents: Wend
                    'Name, Followers,  Following,Posts,Biography
                    Dim aNodeList As Object, ele As Object, t As Date
                    Const MAX_WAIT_SEC As Long = 5

                    t = Timer

                    Do
                        DoEvents
                        On Error Resume Next
                        Set ele = .document.querySelector(".rhpdm")
                        On Error GoTo 0
                        If Timer - t > MAX_WAIT_SEC Then Exit Do
                    Loop While ele Is Nothing

                    '   Application.Wait Now + TimeSerial(0, 0, 2)
                    results(0) = ele.innerText
                    Set aNodeList = .document.querySelectorAll(".g47SY")
                    results(1) = aNodeList.item(0).innerText
                    results(2) = aNodeList.item(1).innerText
                    results(3) = aNodeList.item(2).innerText
                    results(4) = .document.querySelector(".rhpdm ~ span").innerText
                    Set aNodeList = Nothing : Set ele = Nothing
                    groupResults(counter) = results
                    counter = counter + 1
                End If 
            Next        
            .Quit '<== Remember to quit application
        End With

        For i = LBound(groupResults) To UBound(groupResults)
            .Cells(2 + i, "B").Resize(1, UBound(results) + 1) = groupResults(i)
        Next

    End With
End Sub

结果:

enter image description here

© www.soinside.com 2019 - 2024. All rights reserved.