我正在 VBA 中处理非常大的(45,000,000+ 个字符)字符串,我需要删除多余的空白。
一个空格(又名,ASCII Code 32)可以,但任何具有两个或多个连续空格的部分应减少到只有一个。
显而易见的做法是使用 `Replace` 的循环:
'Collapses every run of two or more consecutive spaces in 's' to a single
'space and returns the result with leading and trailing spaces removed.
'
'Note: 's' is passed by reference (VBA default) and is modified in place,
'so after the call the caller's variable holds the collapsed (but not yet
'trimmed) string.
Function MyTrim(s As String) As String
    'Built with Space$ so the two-space search string cannot be silently
    'collapsed to one space by editors or copy/paste
    Dim twoSpaces As String: twoSpaces = Space$(2)

    'Each pass roughly halves the length of every run of spaces; repeat until
    'no two adjacent spaces remain
    Do While InStr(s, twoSpaces) > 0
        s = Replace$(s, twoSpaces, " ")
    Loop
    MyTrim = Trim$(s)
End Function
Len In: 44930886 Len Out: 35322469 Runtime: 247.6 seconds
Option Explicit
'Replaces every run of one or more spaces in 'text' with a single space using
'VBScript.RegExp, then prints the resulting length and the first 20 characters
'to the Immediate window.
Sub Test(ByVal text As String)
    'The RegExp object is created once and kept between calls
    Static Regex As Object
    If Regex Is Nothing Then
        Set Regex = CreateObject("VBScript.RegExp")
        With Regex
            .Global = True
            .MultiLine = True
        End With
    End If

    Regex.Pattern = " +" ' space, one or more times

    Dim result As String
    result = Regex.Replace(text, " ")
    Debug.Print Len(result), Left(result, 20)
End Sub
输入 4500 万个字符的字符串大约需要一秒钟。
'Reads the UTF-16 test file into a string, prints its length and first 20
'characters, then passes it to Test.
Sub Main()
    Const ForReading As Integer = 1
    Const FormatUTF16 As Integer = -1 ' aka TriStateTrue

    Dim fso As Object: Set fso = CreateObject("Scripting.FileSystemObject")
    Dim file As Object
    Set file = fso.OpenTextFile("C:\ProgramData\test.txt", ForReading, False, FormatUTF16)

    Dim text As String: text = file.ReadAll()

    'Close the stream explicitly instead of relying on object release
    file.Close
    Set file = Nothing
    Set fso = Nothing

    Debug.Print Len(text), Left(text, 20)

    'No parentheses: 'Test (text)' would evaluate the argument as an expression
    'and create an extra temporary copy of the very large string
    Test text
End Sub
// Builds a test string of roughly 45 million characters by repeating a short
// pattern, then writes it to disk as little-endian UTF-16 without a BOM.
const int targetLength = 45_000_000;
var pattern = "××\n× ×× ";
var repeatCount = targetLength / pattern.Length;
var content = string.Concat(Enumerable.Repeat(pattern, repeatCount));
var utf16NoBom = new UnicodeEncoding(bigEndian: false, byteOrderMark: false);
File.WriteAllText(@"C:\ProgramData\test.txt", content, utf16NoBom);
顺便说一句——由于 VBA(以及 VB4、Java、JavaScript、C#、VB 等)使用 UTF-16,因此空格字符是一个 UTF-16 代码单元。(任何与 ASCII 的相似或比较都是不必要的脑力体操;如果将其作为 ANSI [`Chr(32)`] 写入代码,则会在幕后进行不必要的转换,并且在不同的机器、用户和时间下会有不同的行为。)
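在 VBA 中,`String` 的大小限制为大约 20 亿个字符。上面的“Replace 循环”方法处理 4500 万个字符的字符串用了约 247 秒。
理论上,这意味着 20 亿个字符的字符串至少需要 3 个小时——如果它甚至没有崩溃的话——所以它并不完全实用。
另一种选择是 Excel 的工作表函数 `Trim`,它与 VBA 的 `Trim` 函数不同:除了去掉首尾空格,它还会把内部连续的空格压缩为一个。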
'Collapses runs of spaces in 'strIn' to a single space and removes leading and
'trailing spaces, using Excel's worksheet TRIM function on chunks of at most
''maxLen' characters (the worksheet function cannot take longer strings).
'
'Because TRIM strips the leading/trailing spaces of every chunk, a space that
'happens to sit on a chunk boundary would be lost and the neighbouring words
'glued together. This is tracked explicitly and exactly one separating space
'is re-inserted between chunks where the input had one or more.
'
'Requires Excel (Application.WorksheetFunction).
Function bigTrim(strIn As String) As String
    Const maxLen = 32766

    Dim total As Long: total = Len(strIn)
    If total = 0 Then Exit Function

    'The result is never longer than the input, so allocate once and fill in
    'place instead of growing the string by concatenation on every chunk.
    'Pre-filling with spaces means a separator is "written" by skipping a slot.
    Dim buffer As String: buffer = Space$(total)
    Dim outPos As Long: outPos = 1

    Dim pos As Long
    Dim chunk As String
    Dim piece As String
    Dim pendingSpace As Boolean 'True if the input had a space just before this chunk's text

    For pos = 1 To total Step maxLen
        chunk = Mid$(strIn, pos, maxLen)
        piece = Application.WorksheetFunction.Trim(chunk)

        If Len(piece) = 0 Then
            'Chunk consisted of spaces only: remember that a separator is due
            pendingSpace = True
        Else
            'Emit one separating space if something was already written and
            'the input had whitespace at the chunk boundary
            If outPos > 1 And (pendingSpace Or Left$(chunk, 1) = " ") Then
                outPos = outPos + 1
            End If
            Mid$(buffer, outPos, Len(piece)) = piece
            outPos = outPos + Len(piece)
            pendingSpace = (Right$(chunk, 1) = " ")
        End If
    Next pos

    bigTrim = Left$(buffer, outPos - 1)
End Function
Len In: 44930886 Len Out: 35321845 Runtime: 33.6 seconds
这比上面的“Replace 循环”方法快了 7 倍以上,并且成功删除了其他方法遗漏的 624 个空格。
事实证明,这里的问题在于 VBA 内置的 `Replace` 函数在对大字符串进行大量替换时非常慢。下面是一个替代实现:
'Works like the inbuilt 'Replace', but only allocates the buffer once and is
'therefore much, much faster on large strings with many replacements
'This function is the renamed function `ReplaceFast` (the source link is
'missing from this copy)
'Note that this implementation is slightly slower than the inbuilt 'Replace'
'function for short strings with few replacements
'
'Parameters:
'  str      - string to search in
'  sFind    - substring to search for
'  sReplace - replacement text
'  lStart   - 1-based position at which the search starts; as with the
'             inbuilt function, the result begins at this position
'  lCount   - maximum number of replacements, -1 = replace all
'  lCompare - comparison method passed on to InStr
'Raises error 5 if lStart < 1 or lCount < -1.
'Depends on 'CountSubstring' to size the result buffer up front.
Public Function Replace(ByRef str As String, _
                        ByRef sFind As String, _
                        ByRef sReplace As String, _
               Optional ByVal lStart As Long = 1, _
               Optional ByVal lCount As Long = -1, _
               Optional ByVal lCompare As VbCompareMethod _
                                       = vbBinaryCompare) As String
    Const methodName As String = "Replace"
    If lStart < 1 Then Err.Raise 5, methodName, _
        "Argument 'lStart' = " & lStart & " < 1, invalid"
    If lCount < -1 Then Err.Raise 5, methodName, _
        "Argument 'lCount' = " & lCount & " < -1, invalid"

    'Maps -1 ("no limit") to the largest positive Long; leaves values >= 0 as is
    lCount = lCount And &H7FFFFFFF

    If Len(str) = 0 Or Len(sFind) = 0 Then
        Replace = Mid$(str, lStart)
        Exit Function
    End If

    Dim lenFind As Long: lenFind = Len(sFind)
    Dim lenReplace As Long: lenReplace = Len(sReplace)

    'Net length change: per-replacement delta times the number of replacements,
    'minus the (lStart - 1) leading characters that are cut off
    Dim bufferSizeChange As Long
    bufferSizeChange = CountSubstring(str, sFind, lStart, lCount, lCompare) _
                     * (lenReplace - lenFind) - lStart + 1

    If Len(str) + bufferSizeChange < 0 Then Exit Function

    'Allocate the result once; it is filled in place with Mid$ statements
    Replace = Space$(Len(str) + bufferSizeChange)

    Dim i As Long: i = InStr(lStart, str, sFind, lCompare)  'current match in str
    Dim j As Long: j = 1                                    'write position in result
    Dim lastOccurrence As Long: lastOccurrence = lStart     'read position in str
    Dim count As Long: count = 1

    Do Until i = 0 Or count > lCount
        'Copy the unchanged text between the previous match and this one
        Dim diff As Long: diff = i - lastOccurrence
        If diff > 0 Then _
            Mid$(Replace, j, diff) = Mid$(str, lastOccurrence, diff)
        j = j + diff

        If lenReplace <> 0 Then
            Mid$(Replace, j, lenReplace) = sReplace
            j = j + lenReplace
        End If

        count = count + 1
        lastOccurrence = i + lenFind
        i = InStr(lastOccurrence, str, sFind, lCompare)
    Loop

    'Copy whatever remains after the last replaced match
    If j <= Len(Replace) Then Mid$(Replace, j) = Mid$(str, lastOccurrence)
End Function
'Counts the non-overlapping occurrences of 'subStr' in 'str'.
'
'Parameters:
'  str      - string to search in
'  subStr   - substring to count; an empty substring yields 0
'  lStart   - 1-based position at which counting starts
'  lLimit   - stop counting once this many occurrences were found,
'             -1 = no limit
'  lCompare - comparison method passed on to InStr
'Raises error 5 if lStart < 1 or lLimit < -1.
Public Function CountSubstring(ByRef str As String, _
                               ByRef subStr As String, _
                      Optional ByVal lStart As Long = 1, _
                      Optional ByVal lLimit As Long = -1, _
                      Optional ByVal lCompare As VbCompareMethod _
                                              = vbBinaryCompare) As Long
    Const methodName As String = "CountSubstring"
    If lStart < 1 Then Err.Raise 5, methodName, _
        "Argument 'Start' = " & lStart & " < 1, invalid"
    If lLimit < -1 Then Err.Raise 5, methodName, _
        "Argument 'lLimit' = " & lLimit & " < -1, invalid"
    If subStr = vbNullString Then Exit Function

    Dim lenSubStr As Long: lenSubStr = Len(subStr)
    Dim i As Long: i = InStr(lStart, str, subStr, lCompare)

    CountSubstring = 0
    'With lLimit = -1 the second condition never becomes true, so all
    'occurrences are counted
    Do Until i = 0 Or lLimit = CountSubstring
        CountSubstring = CountSubstring + 1
        'Continue after the end of this match so matches never overlap
        i = InStr(i + lenSubStr, str, subStr, lCompare)
    Loop
End Function
当这个 `Replace` 函数存在时,原始代码应该只需要大约 1.5 秒就能处理类似于 OP 示例的字符串,改进超过 100 倍:
'Builds a test string of LEN_INPUT_STR characters made of the 5-character
'pattern "<space><space>aaa", runs MyTrim on it and prints the elapsed time
'and the output length to the Immediate window.
Sub DemoMyTrim()
    Const LEN_INPUT_STR = 45000000

    'The pattern must be 5 characters long (hence the division by 5) and must
    'contain a double space, otherwise MyTrim has nothing to do. Space$(2) is
    'used so the double space cannot be collapsed by copy/paste.
    Dim pattern As String: pattern = Space$(2) & "aaa"
    Dim inputStr As String
    inputStr = RepeatString(pattern, LEN_INPUT_STR \ Len(pattern))

    Dim t As Single: t = Timer()
    Dim outStr As String: outStr = MyTrim(inputStr)
    Debug.Print "Trimming took " & Timer() - t & " seconds."
    Debug.Print "Len Out: " & Len(outStr)
End Sub
''RepeatString' function source:
'Returns 'str' repeated 'repeatTimes' times.
'
'Parameters:
'  str         - string to repeat; an empty string yields an empty result
'  repeatTimes - number of repetitions (default 2), 0 yields an empty result
'Raises error 5 if repeatTimes < 0.
Private Function RepeatString(ByRef str As String, _
                     Optional ByVal repeatTimes As Long = 2) As String
    Const methodName As String = "RepeatString"
    If repeatTimes < 0 Then Err.Raise 5, methodName, _
        "Argument 'repeatTimes' = " & repeatTimes & " < 0, invalid"
    If repeatTimes = 0 Then Exit Function

    'Nothing to repeat; also avoids a MidB$ assignment into an empty buffer
    If LenB(str) = 0 Then Exit Function

    'Single character: the inbuilt String$ does the job directly
    If LenB(str) = 2 Then
        RepeatString = String$(repeatTimes, str)
        Exit Function
    End If

    'Allocate the full result once (sizes are in bytes to support strings with
    'an odd byte length); for an odd total, drop one byte from the buffer
    Dim newLength As Long: newLength = LenB(str) * repeatTimes
    RepeatString = Space$((newLength + 1) \ 2)
    If newLength Mod 2 = 1 Then RepeatString = MidB$(RepeatString, 2)

    'Seed the buffer with one copy, then copy the buffer onto itself shifted by
    'one pattern length. NOTE(review): this relies on MidB$ copying front to
    'back so the already written part is propagated through the whole buffer
    MidB$(RepeatString, 1) = str
    If repeatTimes > 1 Then MidB$(RepeatString, LenB(str) + 1) = RepeatString
End Function
'Returns a copy of 's' in which every run of two or more consecutive spaces
'is collapsed to a single space. Leading and trailing spaces are not removed.
''Replace' resolves to the module's own 'Replace' function when it is present,
'otherwise to the inbuilt one.
Public Function MyTrim(ByRef s As String) As String
    'Built with Space$ so the two-space search string cannot be silently
    'collapsed to one space by editors or copy/paste
    Dim twoSpaces As String: twoSpaces = Space$(2)

    MyTrim = s
    'Each pass roughly halves the length of every run of spaces; repeat until
    'no two adjacent spaces remain
    Do While InStr(MyTrim, twoSpaces) > 0
        MyTrim = Replace(MyTrim, twoSpaces, " ")
    Loop
End Function
由于接受的答案使用了 Mac 上不可用的正则表达式,我想提出另一种替代方案:它比使用改进版 `Replace` 函数的原始算法更快,并且仍然只使用在任何平台上都可用的 VBA 内置函数。
这可以通过 LibStringTools 库中的另一个函数实现:
'Replaces consecutive occurrences of 'substring' that repeat more than 'limit'
'times with exactly 'limit' consecutive occurrences
'
'Parameters:
'  str     - string to process
'  subStr  - substring whose consecutive repetitions are limited
'            (default vbNewLine); an empty substring returns 'str' unchanged
'  limit   - maximum number of consecutive occurrences to keep (default 1);
'            0 removes every occurrence via 'Replace'
'  Compare - comparison method passed on to InStr / Replace
'Raises error 5 if limit < 0.
Public Function LimitConsecutiveSubstringRepetition( _
                                           ByRef str As String, _
                                  Optional ByRef subStr As String = vbNewLine, _
                                  Optional ByVal limit As Long = 1, _
                                  Optional ByVal Compare As VbCompareMethod _
                                                         = vbBinaryCompare) _
                                           As String
    Const methodName As String = "LimitConsecutiveSubstringRepetition"

    If limit < 0 Then Err.Raise 5, methodName, _
        "Argument 'limit' = " & limit & " < 0, invalid"
    If limit = 0 Then
        LimitConsecutiveSubstringRepetition = Replace(str, subStr, _
                                                      vbNullString, , , Compare)
        Exit Function
    Else
        'The result is never longer than the input: use a copy of the input as
        'the output buffer, fill it in place and cut it to size at the end
        LimitConsecutiveSubstringRepetition = str
    End If
    If Len(str) = 0 Then Exit Function
    If Len(subStr) = 0 Then Exit Function

    Dim i As Long: i = InStr(1, str, subStr, Compare)  'current match in str
    Dim j As Long: j = 1                               'write position in buffer
    Dim lenSubStr As Long: lenSubStr = Len(subStr)
    'Chosen so that a match at position 1 counts as directly following a
    '(virtual) previous match
    Dim lastOccurrence As Long: lastOccurrence = 1 - lenSubStr
    Dim copyChunkSize As Long      'characters accepted but not yet copied
    Dim consecutiveCount As Long   'length of the current run of matches
    Dim occurrenceDiff As Long

    Do Until i = 0
        occurrenceDiff = i - lastOccurrence
        If occurrenceDiff = lenSubStr Then
            'This match directly follows the previous one
            consecutiveCount = consecutiveCount + 1
            If consecutiveCount <= limit Then
                copyChunkSize = copyChunkSize + occurrenceDiff
            ElseIf consecutiveCount = limit + 1 Then
                'First surplus match: flush everything accepted so far; all
                'further matches of this run are skipped
                Mid$(LimitConsecutiveSubstringRepetition, j, copyChunkSize) = _
                    Mid$(str, i - copyChunkSize, copyChunkSize)
                j = j + copyChunkSize
                copyChunkSize = 0
            End If
        Else
            'Other text lies between the matches: accept it together with the
            'previous match and start a new run
            copyChunkSize = copyChunkSize + occurrenceDiff
            consecutiveCount = 1
        End If
        lastOccurrence = i
        i = InStr(i + lenSubStr, str, subStr, Compare)
    Loop

    'Accept the tail after the last match and flush the pending chunk
    copyChunkSize = copyChunkSize + Len(str) - lastOccurrence - lenSubStr + 1
    Mid$(LimitConsecutiveSubstringRepetition, j, copyChunkSize) = _
        Mid$(str, Len(str) - copyChunkSize + 1)

    LimitConsecutiveSubstringRepetition = _
        Left$(LimitConsecutiveSubstringRepetition, j + copyChunkSize - 1)
End Function
'Usage: limit every run of spaces in the input to a single space
Dim inputStr As String
'... somehow populate input string
Dim outStr As String: outStr = LimitConsecutiveSubstringRepetition(inputStr, " ", 1)