是)我有的:
//This data set contains columns (second index) having the same value in each row (first index)
double[][] dataSet = new double[][]
{
new double[] {1, 2, 3, 4},
new double[] {5, 6, 7, 4},
new double[] {8, 9, 10, 4},
};
我想得到什么:
// This data set has no column where the value in each row is the same
double[][] reducedDataSet = new double[][]
{
new double[] {1, 2, 3},
new double[] {5, 6, 7},
new double[] {8, 9, 10},
};
在python中,这可以通过以下方式轻松完成:
all_equal_value_indices = numpy.all(data_set == data_set[0, :], axis=0) // Finds the indices of all columns that have equal values in each row
reduced_data_set = data_set[:, ~all_equal_value_indices] // Takes all rows with only those columns where all_equal_value_indices is not 1
在C#中我可以得到一个包含应该相对快速排除的索引的数组,但是如何使用这些索引作为掩码才能获得这些索引中不包含的那些列?
我尝试了什么:
var numberOfDeletedColumns = 0;
var reducedDataSet = dataSet;
foreach (var columnToDelete in columnsToDelete)
{
reducedDataSet = reducedDataSet.RemoveColumn(columnToDelete - numberOfDeletedColumns++);
}
RemoveColumn
是Accord.Net提供的扩展,具有以下代码:
/// <summary>Returns a new matrix without one of its columns.</summary>
public static T[][] RemoveColumn<T>(this T[][] matrix, int index)
{
T[][] objArray = new T[matrix.Length][];
for (int index1 = 0; index1 < matrix.Length; ++index1)
{
objArray[index1] = new T[matrix[index1].Length - 1];
for (int index2 = 0; index2 < index; ++index2)
objArray[index1][index2] = matrix[index1][index2];
for (int index2 = index + 1; index2 < matrix[index1].Length; ++index2)
objArray[index1][index2 - 1] = matrix[index1][index2];
}
return objArray;
}
但是这比python中的实现要慢得多。有人可以提出一种更快的方法来实现减少的数据集吗?
Array.Copy
帮助它在我的电脑上运行速度提高了约2倍。
static T[][] FastRemoveColumn<T>(T[][] matrix, int index)
{
T[][] objArray = new T[matrix.Length][];
for (int i = 0; i < matrix.Length; i++)
{
var line = matrix[i];
var reducedline = new T[line.Length - 1];
Array.Copy(line, 0, reducedline, 0, index);
Array.Copy(line, index + 1, reducedline, index, line.Length - index - 1);
objArray[i] = reducedline;
}
return objArray;
}
我也试过多线程。它运行得很慢:
static T[][] MultiThreadRemoveColumn<T>(T[][] matrix, int index)
{
T[][] objArray = new T[matrix.Length][];
Parallel.For(0, matrix.Length, i =>
{
var line = matrix[i];
var reducedline = new T[line.Length - 1];
Array.Copy(line, 0, reducedline, 0, index);
Array.Copy(line, index + 1, reducedline, index, line.Length - index - 1);
objArray[i] = reducedline;
});
return objArray;
}
测试:
// init
double[][] arr = new double[2000][];
for (int i = 0; i < arr.Length; i++)
arr[i] = new double[2000];
double v = 0;
for (int i = 0; i < arr.Length; i++)
{
for (int j = 0; j < arr[i].Length; j++)
{
arr[i][j] = v;
v++;
}
}
Stopwatch sw = Stopwatch.StartNew();
var reducedArr = RemoveColumn(arr, 200);
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
sw.Restart();
var reducedArr2 = FastRemoveColumn(arr, 200);
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
sw.Restart();
var reducedArr3 = MultiThreadRemoveColumn(arr, 200);
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
// Check the result
for (int i = 0; i < reducedArr.Length; i++)
{
for (int j = 0; j < reducedArr[i].Length; j++)
{
if(reducedArr[i][j] != reducedArr2[i][j]) throw new Exception();
if(reducedArr[i][j] != reducedArr3[i][j]) throw new Exception();
}
}
删除多个列的解决方案:
public static T[][] DeleteColumns<T>(T[][] matrix, int[] columns)
{
if (matrix.Length == 0) return new T[0][];
bool[] delColumns = new bool[matrix[0].Length];
foreach (int col in columns) delColumns[col] = true;
List<int> remainCols = new List<int>();
for (int i = 0; i < delColumns.Length; i++)
{
if (!delColumns[i]) remainCols.Add(i);
}
var target = new T[matrix.Length][];
for (int rowIndex = 0; rowIndex < matrix.Length; rowIndex++)
{
T[] sourceRow = matrix[rowIndex];
T[] targetRow = new T[remainCols.Count];
for (int i = 0; i < remainCols.Count; i++)
{
targetRow[i] = sourceRow[remainCols[i]];
}
target[rowIndex] = targetRow;
}
return target;
}
在2000x2000矩阵上测试。与Adam Brown的解决方案相比,测试删除所有列是绝对不公平的,但即使只删除一列,我的解决方案也会更快。
有时,组件在汇总时不能很好地工作。在这种情况下,删除列功能将重新分配整个矩阵,因此操作与要删除的列数成线性关系(ouch)。要解决此问题,请在一次传递中删除所有列。
class Program
{
static void Main(string[] args)
{
var matrix = new[]
{
new [] {1, 2, 3, 4, 5},
new [] {1, 2, 3, 4, 5},
new [] {1, 2, 3, 4, 5},
new [] {1, 2, 3, 4, 5},
};
var result = matrix.DeleteColums(new [] {0, 2, 4});
foreach (var row in result)
{
foreach (var column in row)
{
Console.Write(column);
Console.Write(" ");
}
Console.WriteLine();
}
Console.ReadKey();
}
}
static class MatrixHelper
{
public static T[][] DeleteColums<T>(this T[][] matrix, int[] columns)
{
var sorted = columns.Distinct().OrderBy(e => e).Concat(new [] { int.MaxValue }).ToArray();
var target = new T[matrix.Length][];
for (int row = 0; row < matrix.Length; row++)
{
var sourceRow = matrix[row];
var targetRow = new T[sourceRow.Length - columns.Length];
var sortedIndex = 0;
for (int i = 0; i < sourceRow.Length; i++)
{
if (i == sorted[sortedIndex])
{
sortedIndex++;
continue;
}
targetRow[i - sortedIndex] = sourceRow[i];
}
target[row] = targetRow;
}
return target;
}
}
如果这还不够,那么你需要考虑是否需要使用数组。例如,您可以为矩阵创建一个动态屏蔽列的数据结构,而不是数组数组。
更新:
鉴于本页中的其他解决方案假设矩阵尽管由锯齿状数组表示,但每行具有相同的索引,我想我会再做一个更快的解决方案。在这些假设中,这两个解决方案在这个线程中击败了所有先前的解决方案,包括更快的并行解决方案。
public static T[][] DeleteColumns<T>(this T[][] matrix, int[] columns)
{
if (matrix.Length == 0) return matrix;
//Previous code assumed matrix could be jagged - new code assumes all columns
//present and all rows same length
var rowLength = matrix[0].Length;
if (rowLength == 0) return matrix;
var sorted = columns.Distinct().ToArray();
var target = new T[matrix.Length][];
var remainingLength = rowLength - sorted.Length;
//Allocate the targets all in one go - to avoid doing allocation in parallel.
for (var row = 0; row < matrix.Length; row++)
{
target[row] = new T[remainingLength];
}
//Work out remaining columns (previous code assumed these could
//be different per row, this assumes all rows have the same
//contents.
var remaining = Enumerable.Range(0, rowLength).Except(sorted).ToArray();
for (int row = 0; row < matrix.Length; row++)
{
var sourceRow = matrix[row];
var targetRow = target[row];
for (int i = 0; i < targetRow.Length; i++)
{
targetRow[i] = sourceRow[remaining[i]];
}
}
return target;
}
并行速度越快(并行分配的时间约为总时间的90%):
public static T[][] DeleteColumnsParallel<T>(this T[][] matrix, int[] columns)
{
if (matrix.Length == 0) return matrix;
//Previous code assumed matrix could be jagged - new code assumes all columns
//present and all rows same length
var rowLength = matrix[0].Length;
if (rowLength == 0) return matrix;
var sorted = columns.Distinct().ToArray();
var target = new T[matrix.Length][];
var remainingLength = rowLength - sorted.Length;
//Allocate the targets all in one go - to avoid doing allocation in parallel.
for (var row = 0; row < matrix.Length; row++)
{
target[row] = new T[remainingLength];
}
//Work out remaining columns (previous code assumed these could
//be different per row, this assumes all rows have the same
//contents.
var remaining = Enumerable.Range(0, rowLength).Except(sorted).ToArray();
Parallel.For(0, matrix.Length, row =>
{
var sourceRow = matrix[row];
var targetRow = target[row];
for (int i = 0; i < targetRow.Length; i++)
{
targetRow[i] = sourceRow[remaining[i]];
}
});
return target;
}
10000x10000矩阵的结果随机删除了一半列。
但是,我认为提出一个远远快得多的解决方案非常重要。目前,在最快的并行解决方案中,90%的时间用于分配内存。另一方面,如果你要创建一个拥有它自己的索引器的Matrix类,你将能够动态地假装底层数据结构的某些列不存在。根据您使用矩阵的方式,以及屏蔽行或列的频率,这可能会快得多。