如果我们运行以下命令(感谢@octavioccl的帮助)LINQ查询:
var result = stringsList
.GroupBy(s => s)
.Where(g => g.Count() > 1)
.OrderByDescending(g => g.Count())
.Select(g => g.Key);
它为我们提供了列表中至少出现两次的所有字符串(但完全匹配,即汉明距离=0)。
我只是想知道是否有一个优雅的解决方案(到目前为止我尝试过的所有解决方案都使用循环和丑陋的计数器或正则表达式),我们可以在
Where
子句中指定汉明距离也获取那些位于指定汉明距离范围内的字符串吗?
P.S:所有字符串的长度都相等
更新
非常感谢 krontogiannis 的详细回答。正如我之前提到的,我想获取汉明距离低于给定阈值的字符串列表。他的代码运行得非常好(再次感谢)。
剩下的唯一一件事就是从“结果集”中取出字符串并插入/添加到“列表”中
基本上这就是我想要的:
List<string> outputList = new List<string>();
foreach (string str in patternsList)
{
var rs = wordsList
.GroupBy(w => hamming(w, str))
.Where(h => h.Key <= hammingThreshold)
.OrderByDescending(h => h.Key)
.Select(h => h.Count());
outputList.Add(rs); //I know it won't work but just to show what is needed
}
谢谢
使用 LINQ 计算两个字符串之间的汉明距离可以以一种优雅的方式完成:
Func<string, string, int> hamming = (s1, s2) => s1.Zip(s2, (l, r) => l - r == 0 ? 0 : 1).Sum();
您的问题对“分组”有点模糊。正如您所看到的,计算汉明距离需要两个字符串。因此,您要么需要计算字符串列表中所有单词与输入的汉明距离,要么计算列表中所有单词之间的距离(或者您需要告诉我们的不同内容:-))。
无论如何,我都会给出两个输入示例
var words = new[] {
"hello",
"rellp",
"holla",
"fooba",
"hempd"
};
案例1
var input = "hello";
var hammingThreshold = 3;
var rs = words
.GroupBy(w => hamming(w, input))
.Where(h => h.Key <= hammingThreshold)
.OrderByDescending(h => h.Key);
输出会是这样的
hempd with distance 3
rellp holla with distance 2
hello with distance 0
案例2
var hs = words
.SelectMany((w1, i) =>
words
.Where((w2, j) => i > j)
.Select(w2 => new { Word1 = w1, Word2 = w2 })) // all word pairs except with self
.GroupBy(pair => hamming(pair.Word1, pair.Word2))
.Where(g => g.Key <= hammingThreshold)
.OrderByDescending(g => g.Key);
输出会是这样的
(holla, rellp) (fooba, holla) (hempd, hello) with distance 3
(rellp, hello) (holla, hello) with distance 2
编辑 要仅获取第一组中的单词,您可以使用
SelectMany
var output = rs.SelectMany(g => g).ToList();
OP要求汉明距离,我的算法使用Levenshtein距离算法。但代码很容易转换。
namespace Program
{
public static class Utils
{
public static string LongestCommonSubstring(this IEnumerable<string> arr)
{
// Determine size of the array
var n = arr.Count();
// Take first word from array as reference
var s = arr.ElementAt(0);
var len = s.Length;
var res = "";
for (var i = 0; i < len; i++)
{
for (var j = i + 1; j <= len; j++)
{
// generating all possible substrings
// of our reference string arr[0] i.e s
var stem = s.Substring(i, j - i);
var k = 1;
//for (k = 1; k < n; k++) {
foreach (var item in arr.Skip(1))
{
// Check if the generated stem is
// common to all words
if (!item.Contains(stem))
break;
++k;
}
// If current substring is present in
// all strings and its length is greater
// than current result
if (k == n && res.Length < stem.Length)
res = stem;
}
}
return res;
}
public static HashSet<string> GetShortestGroupedString(this HashSet<string> items, int distanceThreshold = 3, int minimumStringLength = 2)
{
var cluster = new Dictionary<int, List<Tuple<string, string>>>();
var clusterGroups = new HashSet<string>();
var itemCount = items.Count * items.Count;
int k = 0;
var first = items.First();
var added = "";
foreach (var item in items)
//Parallel.ForEach(merged, item => // TODO
{
var computed2 = new List<string>();
foreach (var item2 in items)
{
var distance = LevenshteinDistance.Compute(item, item2);
var firstDistance = LevenshteinDistance.Compute(first, item2);
if (!cluster.ContainsKey(distance)) // TODO: check false
cluster.Add(distance, new List<Tuple<string, string>>());
if (distance > distanceThreshold)
{
++k;
continue;
}
cluster[distance].Add(new Tuple<string, string>(item, item2));
if (firstDistance > distance)
{
var computed = new List<string>();
foreach (var kv in cluster)
{
if (kv.Value.Count == 0) continue;
var longest = kv.Value.Select(dd => dd.Item1).LongestCommonSubstring();
if (string.IsNullOrEmpty(longest)) continue;
computed.Add(longest);
}
var currentAdded = computed.OrderBy(s => s.Length).FirstOrDefault();
var diff = string.IsNullOrEmpty(added) || string.IsNullOrEmpty(currentAdded)
? string.Empty
: currentAdded.Replace(added, string.Empty);
if (!string.IsNullOrEmpty(currentAdded) && diff.Length == currentAdded.Length)
{
var ff = computed2.OrderBy(s => s.Length).FirstOrDefault();
if (ff.Length >= minimumStringLength)
clusterGroups.Add(ff);
computed2.Clear(); // TODO: check false
computed2.Add(diff);
}
else
{
if (diff.Length == 0 && !string.IsNullOrEmpty(added) && !string.IsNullOrEmpty(currentAdded))
computed2.Add(diff);
}
added = currentAdded;
cluster.Clear();
first = item;
}
++k;
}
var f = computed2.OrderBy(s => s.Length).FirstOrDefault();
if (f.Length >= minimumStringLength)
clusterGroups.Add(f);
}
//});
return clusterGroups;
}
}
/// <summary>
/// Contains approximate string matching
/// </summary>
internal static class LevenshteinDistance
{
/// <summary>
/// Compute the distance between two strings.
/// </summary>
public static int Compute(string s, string t)
{
var n = s.Length;
var m = t.Length;
var d = new int[n + 1, m + 1];
// Step 1
if (n == 0)
{
return m;
}
if (m == 0)
{
return n;
}
// Step 2
for (var i = 0; i <= n; d[i, 0] = i++)
{
}
for (var j = 0; j <= m; d[0, j] = j++)
{
}
// Step 3
for (var i = 1; i <= n; i++)
{
//Step 4
for (var j = 1; j <= m; j++)
{
// Step 5
var cost = (t[j - 1] == s[i - 1]) ? 0 : 1;
// Step 6
d[i, j] = Math.Min(
Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
d[i - 1, j - 1] + cost);
}
}
// Step 7
return d[n, m];
}
}
}
代码有两个参考:
我的代码正在 https://codereview.stackexchange.com/questions/272379/get-shortest-grouped-distinct-string-from-hashset-of-strings 进行审核,所以我希望对其进行改进。
微软为 TensorOperations 创建了一个 Nuget 包,HammingDistance 是其操作之一
打电话
ReadOnlySpan<byte> byte1 = [1, 2, 3, 4, 5];
ReadOnlySpan<byte> byte2 = [1, 2, 3, 4, 5];
int hammingDistance = System.Numerics.Tensors.TensorPrimitives.HammingDistance(byte1, byte2);
你可以这样做:
int hammingDistance = 2;
var result = stringsList
.GroupBy(s => s.Substring(0, s.Length - hammingDistance))
.Where(g => g.Count() > 1)
.OrderbyDescending(g => g.Count())
.Select(g => g.Key);