我对 String.Index 不熟悉,有没有比这更好的方法制作子字符串键:
let seq = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA"
let keysize = 2
let lasti = seq.count - keysize
var counts: [String: Int] = [:]
for i in 0...lasti {
let ii = seq.index(seq.startIndex, offsetBy: i)
let jj = seq.index(ii, offsetBy: keysize)
let key = String( seq[ii..<jj] )
if let v = counts[key] {
counts[key] = v + 1
} else {
counts[key] = 1
}
}
for (k,v) in counts {
print("\(k): \(v)")
}
结果:
CC: 5
TA: 1
TG: 3
GC: 9
CG: 7
GT: 2
GA: 3
CA: 3
AC: 2
TC: 2
AG: 3
AT: 1
TT: 2
GG: 12
AA: 1
CT: 3
你可以尝试使用这个。在 Xcode 16.0 中测试
let seq = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA"
let keysize = 2
var counts: [String: Int] = [:]
for i in seq.indices.dropLast(keysize - 1) {
let key = String(seq[i..<seq.index(i, offsetBy: keysize)])
counts[key, default: 0] += 1
}
官方的 Swift Algorithms 包使用
windows(ofCount: 2)
: 快速完成了这个工作
import Algorithms
let counts = seq
.windows(ofCount: 2)
.reduce(into: [:]) { counts, window in
counts[window, default: 0] += 1
}
counts
.sorted { $0.value > $1.value } // Optional: sort before printing
.forEach { k, v in print("\(k): \(v)") }
您可以使用延迟迭代集合来改进代码
func sequence<T, State>(
state: State,
next: @escaping (inout State) -> T?
) -> UnfoldSequence<T, State>
方法如本帖子所示:
类似:
extension Collection {
func windows(of count: Int) -> UnfoldSequence<SubSequence,Index> {
sequence(state: startIndex) { start in
guard start < endIndex,
let end = index(
start,
offsetBy: count,
limitedBy: endIndex
)
else { return nil }
defer { formIndex(after: &start) }
return self[start..<end]
}
}
func windowsFrequency(of count: Int) -> [SubSequence: Int] where SubSequence: Hashable {
windows(of: count).reduce(into: [:]) { $0[$1, default: 0] += 1 }
}
}
let seq = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA"
let freq = seq.windowsFrequency(of: 2)
print(freq)
for kv in freq {
print(kv)
}
这将打印
[“TT”:2、“TG”:3、“GC”:9、“AT”:1、“GA”:3、“CA”:3、“AC”:2、“TA”:1、 “CC”: 5、“CG”: 7、“TC”: 2、“GG”: 12、“CT”: 3、“GT”: 2、“AG”: 3、“AA”: 1]
(键:“TT”,值:2)
(键:“TG”,值:3)
(键:“GC”,值:9)
(键:“AT”,值:1)
(键:“GA”,值:3)
(键:“CA”,值:3)
(键:“AC”,值:2)
(键:“TA”,值:1)
(键:“CC”,值:5)
(键:“CG”,值:7)
(键:“TC”,值:2)
(键:“GG”,值:12)
(键:“CT”,值:3)
(键:“GT”,值:2)
(键:“AG”,值:3)
(键:“AA”,值:1)
在内部,String 对象可以以不同的编码保存数据。例如,UTF8 使用可变的字节数来存储每个字形。因此,通过索引获取字形的成本相对较高。
String.Index
使您能够编写快速有效地遍历字符串的代码,但是从第一个字形(使用 index(:offsetBy:)
)开始索引并使用计数到末尾的索引,每次调用的时间复杂度都为 O(n)
。因此,您的代码将具有 ≈O(n^2)
(又名“n 平方”)时间复杂度。对于短字符串,这不会是一个大问题,但如果你尝试将其应用于更长的字符串,它的性能会变得非常糟糕。
您应该尝试重写它以使用基于先前索引的 String.Index 。或者,您可以将字符串转换为字符数组,并使用整数索引对其进行索引。这很快,但需要更多内存。
将字符串转换为
Character
数组的方法可能如下所示:
import Foundation
let seq = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA"
let array = Array(seq)
let pairs = NSCountedSet()
for index in 0..<array.count-1 {
let pair = "\(array[index])\(array[index+1])"
pairs.add(pair)
}
pairs.forEach { print($0, pairs.count(for: $0))}
(
NSCountedSet
似乎比字典更适合您的应用程序,尽管您当然可以使用字典。)
我写了一些代码,使用字符数组将你的方法与我的方法进行比较:
import Foundation
let alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
let alphaArray = Array(alphabet)
var seq = ""
var counts: [String: Int] = [:]
func addPair(aPair: String) {
if let v = counts[aPair] {
counts[aPair] = v + 1
} else {
counts[aPair] = 1
}
}
for _ in 0...100_000 {
seq.append(alphaArray[Int.random(in:0..<alphaArray.count)] )
}
var start = Date()
let array = Array(seq)
let pairs = NSCountedSet()
for index in 0..<array.count-1 {
let pair = "\(array[index])\(array[index+1])"
pairs.add(pair)
}
let elapsed1 = -start.timeIntervalSinceNow
print(elapsed1)
start = Date()
let keysize = 2
let lasti = seq.count - keysize
counts = [:]
for i in 0...lasti {
let ii = seq.index(seq.startIndex, offsetBy: i)
let jj = seq.index(ii, offsetBy: keysize)
let key = String( seq[ii..<jj] )
pairs.add(key)
}
let elapsed2 = -start.timeIntervalSinceNow
print(elapsed2)
print("Array processing is \(elapsed2/elapsed1) times faster")
输出是
0.03937792778015137
25.516054034233093
Array processing is 647.9785878193057 times faster
因此,对于 100_000 个字符,基于数组的方法几乎快了 650 倍。对于较大的字符串,从头开始偏移的方法会变得非常非常慢。 (对于大型数据集,N 方性能会很快下降。)
抱歉,我还没有调查已发布的几种不同方法。我在原始程序上苦苦挣扎,发现
.utf8
的性能要高得多(输入 250MB)。
let seq = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA"
let much_faster = seq.utf8
let keysize = 2
let lasti = much_faster.count - keysize
var counts: [String: Int] = [:]
for i in 0...lasti {
let ii = much_faster.index(seq.startIndex, offsetBy: i)
let jj = much_faster.index(ii, offsetBy: keysize)
let key = String( much_faster[ii..<jj] ) ?? ""
if let v = counts[key] {
counts[key] = v + 1
} else {
counts[key] = 1
}
}
我还必须调整一些方法参数为
String.UTF8View
当
seq
为空时,这是一个错误,因为它很乐意创建一个降序范围和 -1 索引访问:
for i in 0...lasti {
我的偏好是在以下条件下进行声明和赋值:
if let key = String( much_faster[ii..<jj] ) {
if let v = counts[key] {
counts[key] = v + 1
} else {
counts[key] = 1
}
}