R commands
Published:
useful R commands
Aggregate group_by in python and R
# python: group a DataFrame by every column except the last one and join the
# 'hsapiens_homolog_ensembl_gene' values of each group into one comma-separated
# string. dropna=False keeps groups whose key columns contain NaN.
group_cols = list(df.columns[:-1])
df = df.groupby(group_cols, as_index=False, dropna=False).agg(
    {'hsapiens_homolog_ensembl_gene': ','.join}
)
# R: group by every column except target_column, then collapse the
# target_column values of each group into one comma-separated string.
df_combined <- df %>%
  group_by(across(-target_column)) %>%
  summarize(
    target_column_concat = paste(target_column, collapse = ","),
    .groups = "drop"
  )
dplyr::select for managing columns
# Push WormBase.ID and external_gene_name.y to the end: drop them first, add
# every column back with everything(), then name the two columns again.
results_above2 <- results_above2 %>%
  dplyr::select(
    -WormBase.ID, -external_gene_name.y,
    everything(),
    WormBase.ID, external_gene_name.y
  )

# Put geneID, transcript and external_gene_name first, keep the remaining
# columns in their current order, and drop rowname.
results_above <- results_above %>%
  dplyr::select(
    geneID, transcript, external_gene_name,
    everything(),
    -rowname
  )
# Split a rowname such as "chr1_ENSG00000123456.1" into two columns.
# separate() cuts at "_ENSG" (which removes that text), so the "ENSG" prefix
# is pasted back in front of the "<digits>.<digits>" part of the remainder.
# rowname itself is kept (remove = FALSE); the helper column is dropped.
results_above <- results_above %>%
  separate(
    rowname,
    into = c("chr_region", "transcript_raw"),
    sep = "_ENSG",
    remove = FALSE
  ) %>%
  mutate(
    transcript = paste0("ENSG", str_extract(transcript_raw, "[0-9]+\\.[0-9]+"))
  ) %>%
  dplyr::select(-transcript_raw)
pheatmap colors
# Row-scaled heatmap with a 64-step blue-white-red palette, Ward (ward.D2)
# clustering, hidden row names and a taller row dendrogram.
pheatmap(
  LFQ_three_groups,
  color = colorRampPalette(c("blue", "white", "red"))(64),
  cex = 0.7,
  scale = "row",
  show_rownames = FALSE,
  treeheight_row = 100,
  clustering_method = "ward.D2"
)
basic table save
# Write the first four columns as a tab-separated file with no header,
# no row names and no quoting.
write.table(
  resultRegions[, 1:4],
  file = "~/resultRegions.bed",
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)
# Save a list to an Excel workbook with multiple sheets, keeping both column
# names and row names.
# NOTE(review): R does not expand "${ibb_out}" inside a string; presumably the
# path is substituted by a shell/template before R runs -- confirm.
openxlsx::write.xlsx(
  res.ls,
  "${ibb_out}/intron_ibb.xlsx",
  colNames = TRUE,
  rowNames = TRUE
)
install archived R package
# Install an archived package release from its source tarball:
# download the tarball, install it as a local source package, then delete it.
url <- "https://cran.r-project.org/src/contrib/Archive/peakPick/peakPick_0.11.tar.gz"
pkgFile <- "peakPick_0.11.tar.gz"

download.file(url = url, destfile = pkgFile)

# repos = NULL makes install.packages() treat pkgs as a local file path.
install.packages(pkgs = pkgFile, repos = NULL, type = "source")

# Remove the downloaded tarball.
unlink(pkgFile)
bind_rows
# Call CYYYY_in_random_gene() ten times, turn each result into a one-row
# data frame, and stack the rows with bind_rows().
# lapply() builds the list in one step instead of growing it inside a loop.
random.list.genes <- lapply(seq_len(10), function(i) {
  as.data.frame(matrix(CYYYY_in_random_gene(nrow(master_Top)), nrow = 1))
})
random.list.genes.df <- bind_rows(random.list.genes)
split a column by “;” when the number of “;” differs between rows
# Number of output columns = largest count of ";" in any Protein.IDs value + 1.
# na.rm = TRUE keeps a missing Protein.IDs value from turning the maximum into
# NA, which would make the column-name sequence fail.
n_ids <- max(str_count(protein$Protein.IDs, pattern = ";"), na.rm = TRUE) + 1

# Split Protein.IDs on ";" into protein1..proteinN; rows with fewer IDs are
# padded with NA on the right (fill = "right").
protein_separated <- separate(
  protein,
  Protein.IDs,
  into = paste0("protein", seq_len(n_ids)),
  sep = ";",
  fill = "right"
)
sort and remove duplicates
# Sort by ensembl_gene_id, then bitscore, both in decreasing order.
gene.sorted <- gene[
  order(gene[, "ensembl_gene_id"], gene[, "bitscore"], decreasing = TRUE),
]

# duplicated() flags every repeat after the first occurrence, so dropping the
# flagged rows keeps the highest-bitscore row for each ensembl_gene_id.
gene.sorted.uniq <- gene.sorted[!duplicated(gene.sorted[, "ensembl_gene_id"]), ]
merge without changing order
# Record each row's original position in df_1 as an "id" column.
# seq_len() is used instead of 1:nrow() so a zero-row df_1 gets an empty id
# column rather than the two-element sequence c(1, 0).
df_1$id <- seq_len(nrow(df_1))

# Merge the two data frames on the "label" column without sorting by the key.
merged <- merge(x = df_1, y = df_2, by = "label", sort = FALSE)

# Restore df_1's original row order using the recorded "id".
ordered <- merged[order(merged$id), ]
ggarrange
# Per-row minimum across columns 6 to 9, stored as an extra column.
pval.list.df$pval_min_interaction <- apply(pval.list.df[, 6:9], 1, FUN = min)

# For each selected column: order the rows by that column, take the first 24
# entries of column 1, draw one jitter plot per protein on a 6 x 4 grid, and
# write the grid to "<column name>.top24.pdf".
# NOTE(review): column 18 is presumably the pval_min_interaction column added
# above -- confirm against the actual column count.
for (p in c(2:9, 18)) {
  pval.list.df.p <- pval.list.df[order(pval.list.df[, p]), ][1:24, 1]
  top_ids <- pull(pval.list.df.p, Protein.IDs)

  plot.ls <- list()
  for (i in 1:24) {
    dat <- protein_separated[protein_separated$Protein.IDs == top_ids[i], ]

    panel <- ggplot(dat, aes(x = time, y = value, color = treatment)) +
      geom_jitter(position = position_jitter(0.1)) +
      theme(legend.position = "none") +
      xlab("") +
      ylab("") +
      theme(
        plot.margin = unit(c(0.1, 0.1, 0, 0), "cm"),
        plot.title = element_text(size = 8)
      ) +
      labs(title = top_ids[i])

    # Only the bottom row of the grid (panels 21-24) keeps x-axis tick labels.
    if (i < 21) {
      panel <- panel + theme(axis.text.x = element_blank())
    }
    plot.ls[[i]] <- panel
  }

  figure <- ggarrange(
    plotlist = plot.ls,
    nrow = 6, ncol = 4,
    common.legend = TRUE, legend = "bottom",
    align = "hv"
  )

  pdf(paste0(colnames(pval.list.df)[p], ".top24.pdf"), width = 7, height = 11)
  print(figure)
  # Close every open graphics device, not only the PDF opened above.
  while (dev.cur() > 1) dev.off()
}