我有一个工作代码,它采用两个稀疏矩阵,使用数组和 for 循环将它们剪切、展平并将它们拼接在一起形成一个新的稀疏矩阵。我不明白如何在不这样做的情况下切割、压平和拼接它们。 scipy.sparse 包是否有一种原生方法可以做到这一点?
我提供了代码和示例输出。
这是工作代码:
import numpy as np
from scipy import sparse as sps
# Construct 2 sparse matrices
rng = np.random.default_rng()
S_matrix_1 = sps.random(4,5,density=0.5,random_state=rng)
S_matrix_2 = sps.random(4,5,density=0.5,random_state=rng)
# Print the matrices
print('###########################################')
S_array_1 = S_matrix_1.toarray()
print('Sparse Matrix 1: ',S_array_1.shape)
print(S_array_1)
print('###########################################')
S_array_2 = S_matrix_2.toarray()
print('Sparse Matrix 2: ',S_array_2.shape)
print(S_array_2)
#Cut the matrices (trim edges off)
print('###########################################')
S_cut_1 = S_array_1[1:S_array_1.shape[0]-1,1:S_array_1.shape[1]-1]
print('Sparse Matrix 1 cut:', S_cut_1.shape)
print(S_cut_1)
print('###########################################')
S_cut_2 = S_array_2[1:S_array_2.shape[0]-1,1:S_array_2.shape[1]-1]
print('Sparse Matrix 2 cut:', S_cut_2.shape)
print(S_cut_2)
print('###########################################')
# Flatten matrices for stacking
S_vector_1 = S_cut_1.flatten()
S_vector_2 = S_cut_2.flatten()
col = []
data = []
row = []
# Gather Column data so that first matrix is in first row (0)
c = 0
for i in S_vector_1:
if i != 0:
data.append(i)
col.append(c)
row.append(0)
c += 1
# Gather Column data so that second matrix is in second row (1)
c = 0
for i in S_vector_2:
if i != 0:
data.append(i)
col.append(c)
row.append(1)
c += 1
print('Rows,Cols,Data for combined matrix of two flatted sparse matrices')
print(row,col,data)
# Construct combined sparse matrix, each row is a flattened sparse matrix
sparse_matrix = sps.coo_matrix((data, (row, col)),shape=(2,np.prod(S_cut_1.shape)))
print('###########################################')
print('combined sparse matrix, each row is a flattened sparse matrix')
print(sparse_matrix.toarray())
这是一个示例输出:
###########################################
Sparse Matrix 1: (4, 5)
[[0. 0. 0.68617742 0. 0. ]
[0.76825482 0.3582357 0. 0.0248112 0. ]
[0. 0.33983419 0. 0.37449198 0. ]
[0. 0.13352901 0.79960432 0.79339374 0.48830805]]
###########################################
Sparse Matrix 2: (4, 5)
[[0. 0.40899129 0. 0. 0. ]
[0.26789314 0. 0.950261 0. 0.21694661]
[0.51039687 0.29567104 0.30580362 0.42007714 0. ]
[0. 0. 0.25469614 0.02599109 0. ]]
###########################################
Sparse Matrix 1 cut: (2, 3)
[[0.3582357 0. 0.0248112 ]
[0.33983419 0. 0.37449198]]
###########################################
Sparse Matrix 2 cut: (2, 3)
[[0. 0.950261 0. ]
[0.29567104 0.30580362 0.42007714]]
###########################################
Rows,Cols,Data for combined matrix of two flatted sparse matrices
[0, 0, 0, 0, 1, 1, 1, 1] [0, 2, 3, 5, 1, 3, 4, 5] [0.3582356963462584, 0.024811195277762876, 0.33983419283389527, 0.3744919847066006, 0.9502610017191933, 0.2956710439200694, 0.30580361813501, 0.420077135385148]
###########################################
combined sparse matrix, each row is a flattened sparse matrix
[[0.3582357 0. 0.0248112 0.33983419 0. 0.37449198]
[0. 0.950261 0. 0.29567104 0.30580362 0.42007714]]
scipy.sparse 包是否有一种原生方法可以做到这一点?
我认为没有一个函数可以做到这一点——它太具体了。但是您可以编写自己的版本,同时保留 SciPy 和 NumPy 矢量化操作,并且无需转换为密集格式。这将快很多数量级。
总体策略如下:
代码:
def cut_flatten_stack(S1, S2):
assert S1.shape == S2.shape
S1, S2 = S1.tocsr(), S2.tocsr()
S1 = S1[1:-1,1:-1]
S2 = S2[1:-1,1:-1]
S1, S2 = S1.tocoo(), S2.tocoo()
num_rows = 2
num_cols = np.prod(S1.shape)
nnz = S1.nnz + S2.nnz
# Use smaller dtype for columns if it fits
col_dtype = np.int32 if num_cols < np.iinfo(np.int32).max else np.int64
# Flatten indicies of both matrices to columns
col = np.concatenate([
np.ravel_multi_index((S1.row, S1.col), S1.shape),
np.ravel_multi_index((S2.row, S2.col), S1.shape),
], dtype=col_dtype)
# Create array of S1.nnz zeros, then S2.nnz ones
row = np.zeros(nnz, dtype=np.int32)
row[S1.nnz:] = 1
data = np.concatenate([S1.data, S2.data])
return sps.coo_matrix((data, (row, col)), shape=(num_rows, num_cols))
在性能方面,这比在密度为 50% 的 1000x1000 阵列上的初始版本快约 20 倍。
我分析了这段代码,发现它花费了大量时间在 COO 和 CSR 矩阵之间进行转换。由于我们正在进行的切片相当简单,因此我尝试了一个直接操作 COO 索引来完成切片的版本。
这个版本总体上更难理解,但它比以前的版本快了大约 4 倍。
代码:
def cut_coo_matrix(matrix):
"""Remove first and last row and column of COO matrix"""
assert matrix.getformat() == 'coo'
assert matrix.shape[0] > 2 and matrix.shape[1] > 2
mask = matrix.row != 0
mask &= matrix.row != matrix.shape[0] - 1
mask &= matrix.col != 0
mask &= matrix.col != matrix.shape[1] - 1
row = matrix.row[mask] - 1
col = matrix.col[mask] - 1
data = matrix.data[mask]
new_rows = matrix.shape
return data, row, col, (matrix.shape[0] - 2, matrix.shape[1] - 2)
def cut_flatten_stack2(S1, S2, format='csr'):
assert S1.shape == S2.shape
S1_data, S1_row, S1_col, S1_shape = cut_coo_matrix(S1)
S2_data, S2_row, S2_col, _ = cut_coo_matrix(S2)
num_rows = 2
num_cols = np.prod(S1_shape)
nnz = len(S1_data) + len(S2_data)
# Use smaller datadtype for columns if it fits
col_dtype = np.int32 if num_cols < 2**31 - 1 else np.int64
# Flatten indicies of both matrices to columns
col = np.concatenate([
np.ravel_multi_index((S1_row, S1_col), S1_shape),
np.ravel_multi_index((S2_row, S2_col), S1_shape),
], dtype=col_dtype)
data = np.concatenate([S1_data, S2_data])
if format == 'csr':
# Create array to store pointers to beginning of each row
indptr = np.array([0, len(S1_data), len(S1_data) + len(S2_data)])
return sps.csr_matrix((data, col, indptr), shape=(num_rows, num_cols))
elif format == 'coo':
# Create array of S1.nnz zeros, then S2.nnz ones
row = np.zeros(nnz, dtype=np.int32)
row[len(S1_data):] = 1
return sps.coo_matrix((data, (row, col)), shape=(num_rows, num_cols))
此函数还提供了以 CSR 或 COO 格式发出结果的选项。 CSR 速度更快,因为它避免创建大型“行”数组,但如果您将来的步骤需要 COO,那么最好从 COO 开始,而不是稍后转换。