我的纵向数据如下:
import pandas as pd
# Long-format table: one row per (gene_id, position) measurement, each row
# tagged with the sample it came from ('sample_A' or 'sample_B' only).
data = {
    # Ten consecutive rows for each of the three genes.
    'gene_id': ['gene_1'] * 10 + ['gene_2'] * 10 + ['gene_3'] * 10,
    # Positions 1..5, repeated for each of the six five-row runs.
    'position': [1, 2, 3, 4, 5] * 6,
    'value': [
        5.1, 5.5, 5.7, 6.0, 6.3, 6.3, 6.5, 6.7, 6.8, 5.1,  # gene_1
        2.3, 2.5, 2.7, 3.0, 3.1, 3.1, 3.2, 3.3, 3.4, 2.3,  # gene_2
        3.7, 3.8, 3.9, 4.0, 4.0, 4.0, 4.1, 4.2, 4.3, 3.7,  # gene_3
    ],
    # The same ten-row sample pattern (4 x A, 5 x B, 1 x A) repeats per gene.
    'sample': (['sample_A'] * 4 + ['sample_B'] * 5 + ['sample_A']) * 3,
}
# Create the DataFrame
df = pd.DataFrame(data=data)
我的目标是对基因值的概况(profile)进行聚类,然后查看这些聚类与样本之间的对应关系。这里的 profile 定义如下:给定一个样本和一个 gene_id,取出对应子集中的所有 (position, value) 元组。
通过聚类,我想了解这些 profile(概况)所描绘的曲线在形状和幅度上是如何聚集的。作为起点,简单的 KMeans 就足够了。
聚类之后,思路是为每个 profile(概况)找回它所来自的样本,然后绘制聚类空间,查看各样本的分布情况。
我已经见过这个问题在 R 中的解决方案,但还没有见过 Python 中的解决方案。如有任何帮助,我将不胜感激。
如果我正确理解了您的问题,您可以使用下面的代码来解决。基本上,您需要以 `sample` 和 `gene_id` 列为索引对数据框进行透视(pivot),从而重塑数据。每个 profile(概况)将包含该样本中该基因在各个位置上的值。然后,对数据应用 K 均值聚类(我根据基因数量使用了 3 个簇,但您可以轻松更改)。我使用 PCA 将数据降到二维,以便绘制聚类结果。此外,我还计算了每个簇中的样本分布:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Long-format input: each row is one measurement of a gene at a position,
# labelled with the sample ('sample_A' or 'sample_B') it belongs to.
data = {
    # Each gene contributes ten consecutive rows.
    'gene_id': [gene for gene in ('gene_1', 'gene_2', 'gene_3') for _ in range(10)],
    # Six runs of positions 1 through 5.
    'position': list(range(1, 6)) * 6,
    'value': [
        # gene_1
        5.1, 5.5, 5.7, 6.0, 6.3,
        6.3, 6.5, 6.7, 6.8, 5.1,
        # gene_2
        2.3, 2.5, 2.7, 3.0, 3.1,
        3.1, 3.2, 3.3, 3.4, 2.3,
        # gene_3
        3.7, 3.8, 3.9, 4.0, 4.0,
        4.0, 4.1, 4.2, 4.3, 3.7,
    ],
    # Per gene: four 'sample_A' rows, five 'sample_B' rows, one 'sample_A' row.
    'sample': [*['sample_A'] * 4, *['sample_B'] * 5, 'sample_A'] * 3,
}
df = pd.DataFrame(data)
# Reshape to one row per (sample, gene_id) profile with one column per
# position. pivot_table aggregates with the mean by default, so duplicate
# (sample, gene_id, position) rows would be averaged rather than rejected.
pivot_df = df.pivot_table(index=['sample', 'gene_id'], columns='position', values='value').reset_index()
# Perform K-means clustering
n_clusters = 3
# n_init is given explicitly because its default differs between
# scikit-learn releases (and some releases warn when it is left unset);
# fixing it keeps the clustering reproducible across versions.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# Only the per-position value columns are clustered; the identifier columns
# are dropped from the feature matrix.
profiles = pivot_df.drop(columns=['sample', 'gene_id'])
kmeans.fit(profiles)
pivot_df['cluster'] = kmeans.labels_
# Reduce dimensions using PCA
pca = PCA(n_components=2)
profiles_pca = pca.fit_transform(profiles)
# Reuse pivot_df's index so the column assignments below align row-for-row.
plot_df = pd.DataFrame(profiles_pca, columns=['PC1', 'PC2'], index=pivot_df.index)
plot_df['cluster'] = kmeans.labels_
plot_df['sample'] = pivot_df['sample']
plot_df['gene_id'] = pivot_df['gene_id']
# Build the plot: one scatter series per (cluster, sample) pair, with the
# colour encoding the cluster and the marker shape encoding the sample.
plt.figure(figsize=(8, 6))
markers = {'sample_A': 'o', 'sample_B': 's'}
# One colour per cluster. The original snippet used `colors` without ever
# defining it; indexing is taken modulo the palette length so a larger
# n_clusters cycles through the palette instead of raising IndexError.
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
          'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
for cluster in range(n_clusters):
    cluster_data = plot_df[plot_df['cluster'] == cluster]
    for sample, marker in markers.items():
        sample_data = cluster_data[cluster_data['sample'] == sample]
        if sample_data.empty:
            # Skip combinations with no points so the legend only lists
            # cluster/sample pairs that actually occur.
            continue
        plt.scatter(sample_data['PC1'], sample_data['PC2'],
                    marker=marker,
                    color=colors[cluster % len(colors)],
                    label=f'Cluster {cluster} - {sample}',
                    alpha=0.7)
plt.title('KMeans Clustering of Gene Profiles with Sample Distribution (PCA Reduced)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()
# Count the profiles per (cluster, sample) pair and spread the samples into
# columns. Passing fill_value=0 to unstack fills missing pairs directly, so
# the counts stay integers instead of becoming floats via NaN + fillna.
sample_cluster_distribution = pivot_df.groupby(['cluster', 'sample']).size().unstack(fill_value=0)
print(sample_cluster_distribution)
希望这正是您想要的。干杯!