Excel VBA搜索PDF和提取和名称页面中的文本

问题描述 投票:2回答:3

我有以下代码,它查看我的电子表格的A列中的每个单元格,搜索它在指定的PDF中找到的文本,然后提取它在文本中找到PDF格式的页面,并将其命名为电子表格的单元格。代码有效但速度很慢,我可能需要在PDF中搜索多达200个单词,这可能长达600页。有没有办法让代码更快?目前,它循环遍历每个单元格搜索遍历每个单词的每个页面,直到它在单元格中找到该单词。

    Sub test_with_PDF()

    Dim objApp As Object
    Dim objPDDoc As Object
    Dim objjso As Object
    Dim wordsCount As Long
    Dim page As Long
    Dim i As Long
    Dim strData As String
    Dim strFileName As String
    Dim lastrow As Long, c As Range
    Dim PageNos As Integer
    Dim newPDF As Acrobat.CAcroPDDoc
    Dim NewName As String
    Dim Folder As String
    lastrow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row

    strFileName = selectFile()
    Folder = GetFolder()

    Set objApp = CreateObject("AcroExch.App")
    Set objPDDoc = CreateObject("AcroExch.PDDoc")
    'AD.1 open file, if =false file is damage
    If objPDDoc.Open(strFileName) Then
        Set objjso = objPDDoc.GetJSObject

 PageNos = 0
 For Each c In Sheets("Sheet1").Range("A2:A" & lastrow)

        For page = 0 To objPDDoc.GetNumPages - 1
            wordsCount = objjso.GetPageNumWords(page)
            For i = 0 To wordsCount

                If InStr(1, c.Value, ", ") = 0 Then

                    If objjso.getPageNthWord(page, i) = c.Value Then
                        PageNos = PageNos + 1
                        If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then

                                Set newPDF = CreateObject("AcroExch.pdDoc")
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.Open (NewName)
                                newPDF.InsertPages lngPages, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For
                         Else
                                Set newPDF = CreateObject("AcroExch.PDDoc")
                                newPDF.Create
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.InsertPages -1, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For

                        End If
                    End If
                Else

                If objjso.getPageNthWord(page, i) = c.Offset(0, 1).Value Then
                    If objjso.getPageNthWord(page, i + 1) = c.Offset(0, 2).Value Then
                        PageNos = PageNos + 1
                         If FileExist(Folder & "\" & c.Offset(0, 4) & ".pdf") Then

                                Set newPDF = CreateObject("AcroExch.pdDoc")
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.Open (NewName)
                                newPDF.InsertPages lngPages, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For
                         Else
                                Set newPDF = CreateObject("AcroExch.PDDoc")
                                newPDF.Create
                                NewName = Folder & "\" & c.Offset(0, 4) & ".pdf"
                                newPDF.InsertPages -1, objPDDoc, page, 1, 0
                                newPDF.Save 1, NewName
                                newPDF.Close
                                Set newPDF = Nothing
                                Exit For

                        End If
                        Exit For
                    End If
                End If
            End If
            Next i
        Next page
        c.Offset(0, 3).Value = PageNos
        PageNos = 0
    Next c
    MsgBox "Done"
    Else
        MsgBox "error!"
    End If
End Sub

Function FileExist(path As String) As Boolean
    If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String

On Error GoTo ErrorHandler

Set fd = Application.FileDialog(msoFileDialogFilePicker)

fd.AllowMultiSelect = False

If fd.Show = True Then
    If fd.SelectedItems(1) <> vbNullString Then
        fileName = fd.SelectedItems(1)
    End If
Else
    'Exit code if no file is selected
    End
End If

'Return Selected FileName
selectFile = fileName

Set fd = Nothing

Exit Function

ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)

End Function
Function GetFolder() As String
    Dim fldr As FileDialog
    Dim sItem As String
    Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
    With fldr
        .Title = "Select the Folder where you want you new PDFs to go"
        .AllowMultiSelect = False
        .InitialFileName = Application.DefaultFilePath
        If .Show <> -1 Then GoTo NextCode
        sItem = .SelectedItems(1)
    End With
NextCode:
    GetFolder = sItem
    Set fldr = Nothing
End Function

提前谢谢了。

excel vba excel-vba pdf
3个回答
1
投票

对于某些事情,循环肯定是优秀的,但可以通过这些更高的查询来处理处理。最近,一位同事和我正在做类似的任务(虽然不是pdf相关的),我们在使用range.find方法而不是在每个单元格上执行instr的循环方面取得了很大的成功。

一些兴趣点: - 在使用.find方法时模仿“循环单元格”功能,我们用.cells结束了我们的范围语句,如下所示:

activesheet.usedrange.cells.find()

所需字符串在()内的位置。

- 返回值:“一个Range对象,表示找到该信息的第一个单元格。”

一旦.find方法返回一个范围,后续子程序就可以提取页码和文档名称。

- 如果需要查找事件的第n个实例,“可以使用FindNext和.FindPrevious方法重复搜索。”(Microsoft)

微软概述range.find:https://msdn.microsoft.com/en-us/vba/excel-vba/articles/range-find-method-excel

因此,通过这种方法,用户可以使用基于列表中单元格数的循环来为每个字符串执行.find方法。

下面是(我假设)这必须在excel应用程序中的文本上完成;另外,我没有测试它以确定字符串是否必须单独驻留在单元格中(我不认为这是一个问题)。

‘===================

另一个可能有益的建议是首先使用尽可能少的循环来压缩.pdf中的所有文本(文档对象级别的直接操作)。然后,您的查找/返回方法可以应用于批量文本。

当我从教授的PowerPoints创建学习笔记时,我做了类似的活动;我将所有文本抓取到.txt文件中,然后返回包含字符串列表实例的每个句子。

‘=====================

一些警告:我承认我没有按照你项目的庞大规模执行解析,所以我的建议在实践中可能没有用处。

另外,我在解析.pdf文档时没有做太多工作,因为我尝试先选择任何.txt / excel app,然后改为使用它。

祝你好运;我希望我能够至少提供思考的食物!


0
投票

很抱歉发布一个快速,不完整的答案,但我想我可以指出你的方向。

而不是让系统查找数百亿次的两个术语,然后进行数千亿次比较,将搜索术语放入数组中,并将每个页面的文本转换为长字符串。然后它只需要执行一次每页查找和200次比较。

'Dim your Clipboard functions
Public Declare PtrSafe Function OpenClipboard Lib "user32" (ByVal hwnd As Long) As Long
Public Declare PtrSafe Function EmptyClipboard Lib "user32" () As Long
Public Declare PtrSafe Function CloseClipboard Lib "user32" () As Long

'...

Dim objData As New MSForms.DataObject
Dim arrSearch() As String
Dim strTxt As String

'...

'Create array of search terms
For i = 2 To lastrow
    arrSearch(i - 2) = Sheets("Sheet1").Cells(1, i)
Next i

For page = 0 To objPDDoc.GetNumPages - 1

    '[Move each page into a new document. You already have that code]

    'Clear clipboard
    OpenClipboard (0&)
    EmptyClipboard
    CloseClipboard

    'Copy page to clipboard
    objApp.MenuItemExecute ("SelectAll")
    objApp.MenuItemExecute ("Copy")
    'You can also do this with the JavaScript object: objjso.ExecMenuItem("Item Name")
    'You may have to insert a waiting function like sleep() here to wait for the action to complete

    'Put data from clipboard into a string.
    objData.GetFromClipboard
    strTxt = objData.GetText 'Now you can search the entire content of the page at once, within memory

    'Compare each element of the array to the string
    For i = LBound(arrSearch) To UBound(arrSearch)
        If InStr(1, strTxt, arrSearch(i)) > 0 Then
            '[You found a match. Your code here]
        End If
    Next i

Next page

这仍然很麻烦,因为您必须在新文档中打开每个页面。如果有一个很好的方法来确定你纯粹是通过文本访问哪个页面(例如页面a底部的页码,紧接着是页面b顶部的页眉),那么你可能会看到复制整个页面将文档文本转换为一个字符串,然后使用文本中的线索确定找到匹配后要提取的页面。我相信这会快得多。


0
投票
Sub BatchRenameCS()

Dim objApp As Object
Dim objPDDoc As Object
Dim objjso As Object
Dim newPDF As Acrobat.CAcroPDDoc
Dim lastrow2 As Long
Dim strFileName As String
Dim Folder As String
Dim Page As Long
Dim Cell As Long
Dim PDFCharacterCount() As Long
Dim CharacterCount As Long
Dim i As Integer
Dim c As Integer
Dim x As Integer
Dim strSource As String
Dim strResult As String
Dim PDFCharacters As String
Dim PDFCharacters2 As String
Dim PDFPasteData() As String
Dim PasteDataPage As Integer
Dim LastRow As Long
Dim NewName As String
Dim NewNamePageNum As Integer
Dim Check()

Sheets("Sheet1").Range("C:D").ClearContents

strFileName = selectFile()
Folder = GetFolder()

'create array with pdf word count
Set objApp = CreateObject("AcroExch.App")
Set objPDDoc = CreateObject("AcroExch.PDDoc")
'AD.1 open file, if =false file is damage
    If objPDDoc.Open(strFileName) Then
        Set objjso = objPDDoc.GetJSObject

ReDim PDFCharacterCount(1 To objPDDoc.GetNumPages) As Long

For Page = 1 To objPDDoc.GetNumPages
PDFCharacters = ""
PDFCharacters2 = ""
    For c = 0 To objjso.GetPageNumWords(Page - 1)
    PDFCharacters = PDFCharacters & objjso.getPageNthWord(Page - 1, c)
    Next c
    For i = 1 To Len(PDFCharacters)
        Select Case Asc(Mid(PDFCharacters, i, 1))
            Case 48 To 57, 65 To 90, 97 To 122:
            PDFCharacters2 = PDFCharacters2 & (Mid(PDFCharacters, i, 1))
            Case Else
            PDFCharacters2 = PDFCharacters2 & ""
        End Select
    Next
    PDFCharacterCount(Page) = Len(PDFCharacters2)

Next Page

lastrow2 = Sheets("Sheet2").Cells(Rows.Count, "A").End(xlUp).Row
Page = 1
ReDim PDFPasteData(1 To objPDDoc.GetNumPages) As String
For Cell = 1 To lastrow2
    strResult = ""
    strSource = Sheets("Sheet2").Cells(Cell, 1).Text
    PDFPasteData(Page) = PDFPasteData(Page) & " " & strSource
    For i = 1 To Len(strSource)
        Select Case Asc(Mid(strSource, i, 1))
            Case 48 To 57, 65 To 90, 97 To 122:
            strResult = strResult & (Mid(strSource, i, 1))
            Case Else
            strResult = strResult & ""
        End Select
    Next

CharacterCount = CharacterCount + Len(strResult)

If CharacterCount = PDFCharacterCount(Page) Then
CharacterCount = 0
Page = Page + 1
End If

Next Cell
ReDim Check(2, objPDDoc.GetNumPages)
LastRow = Sheets("Sheet1").Cells(Rows.Count, "A").End(xlUp).Row
For Each LookUpCell In Worksheets("Sheet1").Range("A2:A" & LastRow)
    For PasteDataPage = 1 To objPDDoc.GetNumPages
        If InStr(PDFPasteData(PasteDataPage), LookUpCell.Value) Then
        Check(1, PasteDataPage) = Check(1, PasteDataPage) + 1
        Check(2, PasteDataPage) = Check(2, PasteDataPage) & LookUpCell.Offset(0, 1).Value & Chr(10)
                                If FileExist(Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf") Then

                                        Set newPDF = CreateObject("AcroExch.pdDoc")
                                        NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
                                        newPDF.Open (NewName)
                                        newPDF.InsertPages newPDF.GetNumPages - 1, objPDDoc, PasteDataPage - 1, 1, 0
                                        newPDF.Save 1, NewName
                                        newPDF.Close
                                        Set newPDF = Nothing
                                 Else
                                        Set newPDF = CreateObject("AcroExch.PDDoc")
                                        newPDF.Create
                                        NewName = Folder & "\" & LookUpCell.Offset(0, 1) & ".pdf"
                                        newPDF.InsertPages -1, objPDDoc, PasteDataPage - 1, 1, 0
                                        newPDF.Save 1, NewName
                                        newPDF.Close
                                        Set newPDF = Nothing
                                End If
        End If
    Next PasteDataPage
Next LookUpCell
x = 1
For PasteDataPage = 1 To objPDDoc.GetNumPages
    If Check(1, PasteDataPage) <> 1 Then
    Sheets("Sheet1").Cells(x, 3) = PasteDataPage
    Sheets("Sheet1").Cells(x, 4) = Check(2, PasteDataPage)
    x = x + 1
    End If
Next PasteDataPage
End If
MsgBox "Done"
End Sub
Function FileExist(path As String) As Boolean
    If Dir(path) <> vbNullString Then FileExist = True
End Function
Function selectFile()
Dim fd As FileDialog, fileName As String
On Error GoTo ErrorHandler
Set fd = Application.FileDialog(msoFileDialogFilePicker)
fd.AllowMultiSelect = False
If fd.Show = True Then
    If fd.SelectedItems(1) <> vbNullString Then
        fileName = fd.SelectedItems(1)
    End If
Else
    'Exit code if no file is selected
    End
End If
'Return Selected FileName
selectFile = fileName
Set fd = Nothing
Exit Function
ErrorHandler:
Set fd = Nothing
MsgBox "Error " & Err & ": " & Error(Err)
End Function
Function GetFolder() As String
    Dim fldr As FileDialog
    Dim sItem As String
    Set fldr = Application.FileDialog(msoFileDialogFolderPicker)
    With fldr
        .Title = "Select the Folder where you want you new PDFs to go"
        .AllowMultiSelect = False
        .InitialFileName = Application.DefaultFilePath
        If .Show <> -1 Then GoTo NextCode
        sItem = .SelectedItems(1)
    End With
NextCode:
    GetFolder = sItem
    Set fldr = Nothing

End Function
© www.soinside.com 2019 - 2024. All rights reserved.