R commands

1 minute read

Published:

useful R commands

Aggregate with group_by in Python and R

# Python: group a DataFrame by every column except the last one and
# join the 'hsapiens_homolog_ensembl_gene' values of each group with commas.
# dropna=False keeps groups whose key columns contain missing values.
group_cols = list(df.columns[:-1])
df = (
    df.groupby(group_cols, as_index=False, dropna=False)
    .agg({'hsapiens_homolog_ensembl_gene': ','.join})
)

# R: group by every column except target_column, then collapse the
# target_column values of each group into one comma-separated string.
# .groups = "drop" returns an ungrouped result.
df_combined <- df %>%
  group_by(across(-target_column)) %>%
  summarize(
    target_column_concat = paste(target_column, collapse = ","),
    .groups = "drop"
  )

dplyr::select for managing columns

# Move WormBase.ID and external_gene_name.y to the end of the table:
# dropping them first and then adding everything() back appends them last.
results_above2 <- results_above2 %>%
  dplyr::select(
    -WormBase.ID,
    -external_gene_name.y,
    everything(),
    WormBase.ID,
    external_gene_name.y
  )

# Put geneID, transcript and external_gene_name first, keep the remaining
# columns in their current order, and drop rowname.
results_above <- results_above %>%
  dplyr::select(
    geneID,
    transcript,
    external_gene_name,
    everything(),
    -rowname
  )

# Split a rowname such as chr1_ENSG00000123456.1 into two columns:
#   chr_region - the part before "_ENSG"
#   transcript - "ENSG" followed by the versioned numeric ID
# rowname itself is kept (remove = FALSE).
results_above <- results_above %>%
  separate(
    rowname,
    into = c("chr_region", "transcript_raw"),
    sep = "_ENSG",
    remove = FALSE
  ) %>%
  # str_c() propagates NA, so a rowname without a "<digits>.<digits>" ID
  # yields NA instead of the misleading literal string "ENSGNA" that
  # paste0("ENSG", NA) would produce.
  mutate(
    transcript = str_c("ENSG", str_extract(transcript_raw, "[0-9]+\\.[0-9]+"))
  ) %>%
  dplyr::select(-transcript_raw)

pheatmap colors

# Row-scaled heatmap with a 64-step blue -> white -> red palette,
# hidden row names and Ward (ward.D2) clustering.
heat_colors <- colorRampPalette(c("blue", "white", "red"))(64)
pheatmap(
  LFQ_three_groups,
  color = heat_colors,
  cex = 0.7,
  scale = "row",
  show_rownames = FALSE,
  treeheight_row = 100,
  clustering_method = "ward.D2"
)

basic table save

# Write the first four columns as a headerless, unquoted, tab-separated file
# (no row names, no column names).
write.table(
  resultRegions[, 1:4],
  file = "~/resultRegions.bed",
  sep = '\t',
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)

# Save a list to an Excel workbook with one sheet per list element.
# NOTE(review): R does not expand "${ibb_out}"; presumably this line sits in a
# shell heredoc/template where the shell substitutes it - confirm.
openxlsx::write.xlsx(
  res.ls,
  "${ibb_out}/intron_ibb.xlsx",
  colNames = TRUE,
  rowNames = TRUE
)

install archived R package

# Install a package version that is only available in the CRAN archive:
# download the source tarball, install from the local file, then delete it.
url <- "https://cran.r-project.org/src/contrib/Archive/peakPick/peakPick_0.11.tar.gz"
pkgFile <- basename(url)  # "peakPick_0.11.tar.gz", saved in the working directory
download.file(url = url, destfile = pkgFile)
install.packages(pkgs = pkgFile, type = "source", repos = NULL)
unlink(pkgFile)

bind_rows

# Call CYYYY_in_random_gene() ten times, store each result as a one-row
# data frame, then stack the ten rows into a single data frame.
random.list.genes <- lapply(1:10, function(i) {
  as.data.frame(matrix(CYYYY_in_random_gene(nrow(master_Top)), nrow = 1))
})
random.list.genes.df <- bind_rows(random.list.genes)

Split a column by “;” when the number of “;” separators differs between rows

# The widest entry has (number of ";" + 1) fields; create that many columns
# (protein1, protein2, ...) and pad shorter entries with NA on the right.
n_fields <- max(str_count(protein$Protein.IDs, pattern = ";") + 1)
protein_separated <- separate(
  protein,
  Protein.IDs,
  into = paste0("protein", 1:n_fields),
  sep = ";",
  fill = "right"
)

sort and remove duplicates

# Sort by gene ID and bitscore, both descending, so that the highest-scoring
# row of each gene comes first; duplicated() then keeps only that first row.
sort_idx <- order(
  gene[, "ensembl_gene_id"],
  gene[, "bitscore"],
  decreasing = TRUE
)
gene.sorted <- gene[sort_idx, ]
is_repeat <- duplicated(gene.sorted[, "ensembl_gene_id"])
gene.sorted.uniq <- gene.sorted[!is_repeat, ]

merge without changing order

# Merge df_1 with df_2 on "label" while preserving the row order of df_1.

# Tag every row of df_1 with its original position. seq_len() is used instead
# of 1:nrow(df_1) so that an empty df_1 gets an empty id column rather than
# the length-2 vector c(1, 0), which makes the assignment fail.
df_1$id <- seq_len(nrow(df_1))

# Merge on the label column; sort = FALSE stops merge() from sorting by key.
merged <- merge(x = df_1, y = df_2, by = "label", sort = FALSE)

# Restore the original df_1 order using the position tag.
# NOTE(review): if df_2 also has a column named "id", merge() renames both to
# id.x / id.y and merged$id is NULL - confirm df_2 has no such column.
ordered <- merged[order(merged$id), ]

ggarrange

# For several p-value columns, plot the 24 proteins with the smallest values
# on a 6 x 4 grid and save one PDF per column.

# Row-wise minimum over columns 6-9.
pval.list.df$pval_min_interaction <- apply(pval.list.df[, 6:9], 1, FUN = min)

for (p in c(2:9, 18)) {
  # First column of the 24 rows with the smallest values in column p.
  # NOTE(review): pull() below needs a data frame, so pval.list.df is
  # presumably a tibble (where `[1:24, 1]` does not drop to a vector) - confirm.
  ranked <- pval.list.df[order(pval.list.df[, p]), ]
  pval.list.df.p <- ranked[1:24, 1]
  top_ids <- pull(pval.list.df.p, Protein.IDs)

  plot.ls <- list()
  for (i in 1:24) {
    dat <- protein_separated[protein_separated$Protein.IDs == top_ids[i], ]
    panel <- ggplot(dat, aes(x = time, y = value, color = treatment)) +
      geom_jitter(position = position_jitter(0.1)) +
      theme(legend.position = "none") +
      xlab("") +
      ylab("") +
      theme(
        plot.margin = unit(c(0.1, 0.1, 0, 0), "cm"),
        plot.title = element_text(size = 8)
      ) +
      labs(title = top_ids[i])
    # Panels 21-24 (presumably the bottom row of the 6 x 4 grid) keep their
    # x-axis tick labels; all others hide them.
    if (i < 21) {
      panel <- panel + theme(axis.text.x = element_blank())
    }
    plot.ls[[i]] <- panel
  }

  figure <- ggarrange(
    plotlist = plot.ls,
    nrow = 6,
    ncol = 4,
    common.legend = TRUE,
    legend = "bottom",
    align = "hv"
  )

  pdf(paste0(colnames(pval.list.df)[p], ".top24.pdf"), width = 7, height = 11)
  print(figure)
  # Closes every open graphics device, not only the PDF opened above.
  while (dev.cur() > 1) {
    dev.off()
  }
}

Tags: