In [1]:
import os
import pandas as pd
import numpy as np
import AGEpy as age
from biomart import BiomartServer
In [2]:
# Project directory holding the input table; other cells append file names to
# `folder` by plain string concatenation, so the trailing slash is kept.
folder = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics/"
anonymous = os.path.join(folder, "anonymous_aging_hits.xlsx")

# Load the hits table from the Excel file.
df_anonymous = pd.read_excel(anonymous)
In [3]:
# Preview the first rows of the loaded table.
df_anonymous.head()
Out[3]:
GeneNames gene_id Class chr_id start stop strand type position sites ... log2FC_wt_D0_vs_wt_D7 pvalue_wt_D14_vs_wt_D21 padj_wt_D14_vs_wt_D21 log2FC_wt_D14_vs_wt_D21 pvalue_wt_D7_vs_wt_D14 padj_wt_D7_vs_wt_D14 log2FC_wt_D7_vs_wt_D14 pvalue_wt_D7_vs_wt_D21 padj_wt_D7_vs_wt_D21 log2FC_wt_D7_vs_wt_D21
0 WBGene00002978 sajrG002946 c I 14622304 14622511 -1 ALT INTERNAL ad ... 1.597883 0.644669 1 0.049143 6.353030e-20 2.243961e-16 1.013759 6.329868e-29 3.999084e-25 1.062902
1 WBGene00003738 sajrG039325 = V 12915313 12915555 1 ALT INTERNAL ad ... 0.998421 0.269557 1 0.321127 2.600493e-09 1.198073e-06 1.491427 2.261997e-09 4.438150e-07 1.812554
2 WBGene00005007 sajrG028932 = IV 8892046 8892048 1 ALT INTERNAL aa ... 0.490385 0.448171 1 0.068356 2.050674e-05 2.678814e-03 0.294061 1.764764e-05 1.376472e-03 0.362417
3 WBGene00006759 sajrG030873 = IV 11994770 11995108 -1 ALT INTERNAL ad ... 0.768802 0.769264 1 0.099608 2.799405e-06 5.088198e-04 0.812186 1.119455e-06 1.211044e-04 0.911795
4 WBGene00006780 sajrG028599 c IV 5966653 5966763 1 ALT FIRST sd ... 1.029296 0.824274 1 0.027321 4.292683e-11 2.966524e-08 1.025532 2.593437e-15 1.780958e-12 1.052853

5 rows × 31 columns

In [4]:
# Settings for the (currently disabled) ageing-evidence lookup below.
biomarthost = "http://jul2023.archive.ensembl.org/biomart/"
python_output = folder
species = "caenorhabditis elegans"

# Taxonomy ID per species name.
# NOTE(review): these look like NCBI Taxonomy IDs -- confirm.
taxons = {
    "caenorhabditis elegans": "6239",
    "drosophila melanogaster": "7227",
    "mus musculus": "10090",
    "homo sapiens": "9606",
    "saccharomyces cerevisiae": "4932",
    "nothobranchius furzeri": "105023",
}

# Organism tag per species; the disabled code below uses it as the prefix of
# the "<tag>_ensembl_gene_id" column. Only four of the species in `taxons`
# have a tag.
tags = {
    "caenorhabditis elegans": "CEL",
    "drosophila melanogaster": "DMEL",
    "mus musculus": "MUS",
    "homo sapiens": "HSA",
}

taxon_id = taxons[species]

# Disabled ageing-evidence lookup, kept for reference.
# NOTE(review): it calls sys.stdout.flush(), but `sys` is not among the
# notebook's imports -- add `import sys` before re-enabling.
# aging_genes = []
# ### ATTENTION ### if you are using yeast, you will need to uncomment the following lines
# if species in tags.keys():
#     organismtag=tags[species]

#     if not os.path.isfile(python_output+"/homdf.txt"):
#         print("Could not find ageing evidence table. Using biomart to create one.")
#         sys.stdout.flush()
#         homdf,HSA,MUS,CEL,DMEL=age.FilterGOstring(host=biomarthost)
#         homdf.to_csv(python_output+"/homdf.txt", index=None,sep="\t")
#     else:
#         print("Found existing ageing evidence table.")
#         sys.stdout.flush()
#     homdf=pd.read_csv(python_output+"/homdf.txt", sep="\t")
#     aging_genes=homdf[[organismtag+"_ensembl_gene_id","evidence"]].dropna()
#     aging_genes=aging_genes[aging_genes[organismtag+"_ensembl_gene_id"]!="None"]
#     aging_genes=aging_genes[organismtag+"_ensembl_gene_id"].tolist()
In [5]:
# Connect to the Ensembl BioMart archive and select the C. elegans gene dataset.
# NOTE(review): this host is the dec2021 archive, while `biomarthost` defined
# in an earlier cell points at jul2023 -- confirm which release is intended.
server = BiomartServer('http://dec2021.archive.ensembl.org/biomart')
celegans_dataset = server.datasets["celegans_gene_ensembl"]
celegans_dataset
Out[5]:
Caenorhabditis elegans genes (WBcel235)
In [6]:
# Query BioMart for the human homologs of every C. elegans gene and parse the
# tab-separated reply into a DataFrame (one row per gene/homolog pair, all
# values kept as strings).
attributes = [
    'ensembl_gene_id',
    'external_gene_name',
    'hsapiens_homolog_ensembl_gene',
    'hsapiens_homolog_associated_gene_name',
    'hsapiens_homolog_orthology_confidence',
]
biomart_reply = celegans_dataset.search({'attributes': attributes})

# A trailing newline in the reply makes split("\n") yield an empty string,
# which would otherwise end up as a bogus, mostly-None last row; skip blanks.
rows = [
    line.split("\t")
    for line in biomart_reply.content.decode().split("\n")
    if line
]

# Fail loudly on lines whose field count does not match the requested
# attributes instead of building a silently padded or misaligned table.
malformed = [fields for fields in rows if len(fields) != len(attributes)]
if malformed:
    raise ValueError(
        "BioMart returned {} line(s) without exactly {} fields; "
        "first offending line: {!r}".format(
            len(malformed), len(attributes), malformed[0]
        )
    )

response = pd.DataFrame(rows, columns=attributes)
response.head()
Out[6]:
ensembl_gene_id external_gene_name hsapiens_homolog_ensembl_gene hsapiens_homolog_associated_gene_name hsapiens_homolog_orthology_confidence
0 WBGene00000001 aap-1 ENSG00000278139 P3R3URF-PIK3R3 0
1 WBGene00000001 aap-1 ENSG00000117461 PIK3R3 0
2 WBGene00000001 aap-1 ENSG00000268173 0
3 WBGene00000001 aap-1 ENSG00000105647 PIK3R2 0
4 WBGene00000002 aat-1 ENSG00000092068 SLC7A8 0
In [11]:
# # aging genes
def check_aging_genes(gene_names, genes_of_interest):
    """Report whether any gene in a comma-separated list is a gene of interest.

    Parameters
    ----------
    gene_names
        Comma-separated gene identifiers (e.g. ``"geneA,geneB"``). Any
        non-string value, such as the NaN pandas uses for an empty cell, is
        treated as "no genes".
    genes_of_interest
        Container supporting ``in`` (list, set, array, ...) that holds the
        identifiers to look for.

    Returns
    -------
    str
        ``'yes'`` if at least one identifier is in ``genes_of_interest``,
        otherwise ``'no'``.
    """
    # Missing cells reach .apply() as float NaN / None and have no .split().
    if not isinstance(gene_names, str):
        return 'no'
    # Strip whitespace so "a, b" matches "b"; ignore empty tokens left by
    # stray or trailing commas.
    candidates = (token.strip() for token in gene_names.split(','))
    found = any(gene in genes_of_interest for gene in candidates if gene)
    return 'yes' if found else 'no'

# # Apply the function to create a new column 'AgingGene'
# df_anonymous['AgingGene'] = df_anonymous['GeneNames'].apply(check_aging_genes, genes_of_interest=aging_genes)
# df_anonymous
In [8]:
# df_Yidong['AgingGene'] = df_Yidong['GeneNames'].apply(check_aging_genes, genes_of_interest=aging_genes)
# df_Yidong
In [9]:
# homologs
gene_names_expanded = df_anonymous['GeneNames'].str.split(',').explode()
response_subset = response[response['ensembl_gene_id'].isin(gene_names_expanded)]
response_subset_hs = response_subset[(response_subset['hsapiens_homolog_ensembl_gene'].notna()) & (response_subset['hsapiens_homolog_ensembl_gene'] != '')]
response_subset_hs_ids = response_subset_hs['ensembl_gene_id'].unique()
In [12]:
# Flag each row 'yes'/'no' depending on whether any of its genes has a human
# homolog, then write the annotated table next to the input file.
homolog_flags = df_anonymous['GeneNames'].apply(
    check_aging_genes,
    genes_of_interest=response_subset_hs_ids,
)
df_anonymous['Human_homologs'] = homolog_flags

annotated_path = folder + "anonymous_aging_hits_annotated.xlsx"
df_anonymous.to_excel(annotated_path, index=False)

df_anonymous.head()
Out[12]:
GeneNames gene_id Class chr_id start stop strand type position sites ... pvalue_wt_D14_vs_wt_D21 padj_wt_D14_vs_wt_D21 log2FC_wt_D14_vs_wt_D21 pvalue_wt_D7_vs_wt_D14 padj_wt_D7_vs_wt_D14 log2FC_wt_D7_vs_wt_D14 pvalue_wt_D7_vs_wt_D21 padj_wt_D7_vs_wt_D21 log2FC_wt_D7_vs_wt_D21 Human_homologs
0 WBGene00002978 sajrG002946 c I 14622304 14622511 -1 ALT INTERNAL ad ... 0.644669 1 0.049143 6.353030e-20 2.243961e-16 1.013759 6.329868e-29 3.999084e-25 1.062902 yes
1 WBGene00003738 sajrG039325 = V 12915313 12915555 1 ALT INTERNAL ad ... 0.269557 1 0.321127 2.600493e-09 1.198073e-06 1.491427 2.261997e-09 4.438150e-07 1.812554 no
2 WBGene00005007 sajrG028932 = IV 8892046 8892048 1 ALT INTERNAL aa ... 0.448171 1 0.068356 2.050674e-05 2.678814e-03 0.294061 1.764764e-05 1.376472e-03 0.362417 yes
3 WBGene00006759 sajrG030873 = IV 11994770 11995108 -1 ALT INTERNAL ad ... 0.769264 1 0.099608 2.799405e-06 5.088198e-04 0.812186 1.119455e-06 1.211044e-04 0.911795 yes
4 WBGene00006780 sajrG028599 c IV 5966653 5966763 1 ALT FIRST sd ... 0.824274 1 0.027321 4.292683e-11 2.966524e-08 1.025532 2.593437e-15 1.780958e-12 1.052853 yes

5 rows × 32 columns

In [ ]:
 
In [ ]: