library("topGO")
#list with transcripts ID and one or more associated GO terms
contig2GO=readMappings(file="C:/Users/Documents/transcripts_topGO.txt")
str(head(contig2GO))
names_contig=names(contig2GO)
my_list <- factor(as.integer(names_contig %in% list_of_interest$id))
names(my_list)=names_contig
GOdata_BP = new("topGOdata",ontology="BP",allGenes = my_list,annot=annFUN.gene2GO,gene2GO=contig2GO)
> GOdata_BP
------------------------- topGOdata object -------------------------
Description:
-
Ontology:
- BP
12532 available genes (all genes from the array):
- symbol: tr1 tr2 tr3 ...
- 73 significant genes.
5255 feasible genes (genes that can be used in the analysis):
- symbol: tr1 tr2 tr3 ...
- 38 significant genes.
GO graph (nodes with at least 1 genes):
- a graph with directed edges
- number of nodes = 2340
- number of edges = 4702
------------------------- topGOdata object -------------------------
resultsFisher_BP=runTest(GOdata_BP,algorithm = "classic",statistic = "fisher")
> print(resultsFisher_BP)
Description:
Ontology: BP
'classic' algorithm with the 'fisher' test
2340 GO terms scored: 35 terms with p < 0.01
Annotation data:
Annotated genes: 5255
Significant genes: 38
Min. no. of genes annotated to a GO: 1
Nontrivial nodes: 228
topGO_results_BP <- GenTable(GOdata_BP, classicFisher = resultsFisher_BP, topNodes = 100)
因此,我有35个与pmy_list <- factor(as.integer(names_contig %in% list_of_interest$id))
< 0.01. Now I want to find which are the genes in
相关的术语。 有没有直接的方法来获取它们?现在,我对此进行了交叉的态度:
topGO_results_BP$classicFisher <- as.numeric(topGO_results_BP$classicFisher)
topGO_results_BP$classicFisher <- round(topGO_results_BP$classicFisher,10)
significant_GO_BP <- topGO_results_BP %>% filter(classicFisher < 0.01)
list_of_interest_filt <- list_of_interest %>% filter(!is.na(GO)) %>% mutate(GO_list = str_split(GO, ",")) %>% filter(
sapply(GO_list, function(go_terms) any(go_terms %in% significant_GO_BP$GO.ID))) %>% dplyr::select(-GO_list)
是间接的。我想知道是否有一些事情可以更好地做这项工作?
我知道我提供了代码,而不是可重复的示例。 Transcripts_topgo就像:
genesInTerm
和list_of_interest是一个较大的数据框架,但其中包含Transcripts_topgo的子集,带有成绩单名称和关联的go。
是的,有一种更直接的方法可以使用GenesInterm()提取与富集的GO术语相关的重要基因。此功能使您可以检索映射到topgodata对象中特定go术语的基因。 try:
> set.seed(42)
> transcripts <- paste0("tr", 1:10)
> go_terms <- c("GO:0005515", "GO:0007616", "GO:0016021", "GO:0043226", "GO:46872")
> go_column <- sapply(1:10, function(x) paste(sample(go_terms, sample(1:4, 1), replace = TRUE), collapse = ","))
> transcripts_topGO <- data.frame(Sample = samples, GO = go_column, stringsAsFactors = FALSE)
> transcripts_topGO
transcripts GO
1 tr1 GO:46872
2 tr2 GO:0005515
3 tr3 GO:0043226,GO:0007616
4 tr4 GO:0005515,GO:0043226
5 tr5 GO:46872
6 tr6 GO:0043226,GO:0007616
7 tr7 GO:0016021,GO:0005515
8 tr8 GO:0016021
9 tr9 GO:46872,GO:46872,GO:46872,GO:0043226
10 tr10 GO:0043226,GO:0016021
说实话,我不是Go术语的忠实拥护者。他们太简单了。他们通常不会告诉我们我们不知道的任何事情。