import os
import pandas as pd
import numpy as np
import AGEpy as age
from biomart import BiomartServer

# Project directory holding both the input hit table and the annotated output.
folder = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics/"
# Path of the Excel workbook to annotate (folder already ends with "/").
anonymous = f"{folder}anonymous_aging_hits.xlsx"

# Load the workbook into a DataFrame; later code reads its 'GeneNames' column.
df_anonymous = pd.read_excel(anonymous)

# Notebook-style preview of the first rows (no effect when run as a script).
df_anonymous.head()

# Ensembl archive host; in the visible code it is only referenced by the
# commented-out ageing-evidence section.
biomarthost = "http://jul2023.archive.ensembl.org/biomart/"
# Output directory for intermediate tables (same as the project folder).
python_output = folder
# Organism being analysed; must be a key of `taxons`.
species = "caenorhabditis elegans"

# Species name -> taxon identifier (stored as strings).
taxons = {
    "caenorhabditis elegans": "6239",
    "drosophila melanogaster": "7227",
    "mus musculus": "10090",
    "homo sapiens": "9606",
    "saccharomyces cerevisiae": "4932",
    "nothobranchius furzeri": "105023",
}

# Species name -> short organism tag; only these four species have one.
tags = {
    "caenorhabditis elegans": "CEL",
    "drosophila melanogaster": "DMEL",
    "mus musculus": "MUS",
    "homo sapiens": "HSA",
}

taxon_id = taxons[species]
# aging_genes = []
# ### ATTENTION ### if you are using yeast, you will need to uncomment the following lines
# if species in tags.keys():
#     organismtag=tags[species]

#     if not os.path.isfile(python_output+"/homdf.txt"):
#         print("Could not find ageing evidence table. Using biomart to create one.")
#         sys.stdout.flush()
#         homdf,HSA,MUS,CEL,DMEL=age.FilterGOstring(host=biomarthost)
#         homdf.to_csv(python_output+"/homdf.txt", index=None,sep="\t")
#     else:
#         print("Found existing ageing evidence table.")
#         sys.stdout.flush()
#     homdf=pd.read_csv(python_output+"/homdf.txt", sep="\t")
#     aging_genes=homdf[[organismtag+"_ensembl_gene_id","evidence"]].dropna()
#     aging_genes=aging_genes[aging_genes[organismtag+"_ensembl_gene_id"]!="None"]
#     aging_genes=aging_genes[organismtag+"_ensembl_gene_id"].tolist()

# Connect to the Ensembl BioMart archive used for the homolog lookup.
# NOTE(review): this archive (dec2021) differs from `biomarthost` (jul2023)
# defined earlier in the file - confirm which release is intended.
server = BiomartServer('http://dec2021.archive.ensembl.org/biomart')

# Select the C. elegans gene dataset from the server's dataset mapping.
celegans_dataset = server.datasets["celegans_gene_ensembl"]

# Notebook-style display of the dataset object (no effect when run as a script).
celegans_dataset

# Output: Caenorhabditis elegans genes (WBcel235)

# Columns requested from BioMart: the worm gene ID and name plus its human
# homolog ID, name and orthology-confidence value.
attributes = [
    'ensembl_gene_id',
    'external_gene_name',
    'hsapiens_homolog_ensembl_gene',
    'hsapiens_homolog_associated_gene_name',
    'hsapiens_homolog_orthology_confidence',
]

# Query the dataset; the payload is decoded and parsed as tab-separated text,
# one record per line.
response = celegans_dataset.search({'attributes': attributes})

# Skip empty lines: the payload ends with a newline, so a plain split("\n")
# leaves a trailing empty string that would become a spurious all-empty row.
response = [
    line.split("\t")
    for line in response.content.decode().split("\n")
    if line
]
response = pd.DataFrame(response, columns=attributes)

# Notebook-style preview of the first rows (no effect when run as a script).
response.head()

# # aging genes
def check_aging_genes(gene_names, genes_of_interest):
    """Report whether any gene in a comma-separated string is a gene of interest.

    Despite its name, this is a generic membership test: it is also used to
    flag rows whose genes have a human homolog.

    Args:
        gene_names: Comma-separated gene identifiers, e.g. ``"id1,id2"``.
            Whitespace around each identifier is ignored. Non-string values
            (such as the NaN pandas uses for an empty cell, or None) are
            treated as "no genes".
        genes_of_interest: Any container supporting ``in`` (list, set,
            NumPy array, ...) holding the identifiers to look for.

    Returns:
        ``'yes'`` if at least one identifier is in ``genes_of_interest``,
        otherwise ``'no'``.
    """
    # Missing cells arrive as float NaN / None and have no .split().
    if not isinstance(gene_names, str):
        return 'no'

    gene_list = (gene.strip() for gene in gene_names.split(','))
    # Empty tokens (from "", "a,,b" or a trailing comma) never count as a hit.
    return 'yes' if any(gene and gene in genes_of_interest for gene in gene_list) else 'no'

# # Apply the function to create a new column 'AgingGene'
# df_anonymous['AgingGene'] = df_anonymous['GeneNames'].apply(check_aging_genes, genes_of_interest=aging_genes)
# df_anonymous

# df_Yidong['AgingGene'] = df_Yidong['GeneNames'].apply(check_aging_genes, genes_of_interest=aging_genes)
# df_Yidong

# homologs
# GeneNames may hold several comma-separated IDs per row; expand to one ID per entry.
gene_names_expanded = df_anonymous['GeneNames'].str.split(',').explode()

# Restrict the BioMart table to genes that occur in the hit table.
in_hit_table = response['ensembl_gene_id'].isin(gene_names_expanded)
response_subset = response[in_hit_table]

# Of those, keep the rows that actually carry a human homolog ID.
human_homolog_id = response_subset['hsapiens_homolog_ensembl_gene']
has_human_homolog = human_homolog_id.notna() & (human_homolog_id != '')
response_subset_hs = response_subset[has_human_homolog]
response_subset_hs_ids = response_subset_hs['ensembl_gene_id'].unique()

# Flag each hit 'yes'/'no' depending on whether any of its genes has a human homolog.
df_anonymous['Human_homologs'] = df_anonymous['GeneNames'].apply(
    check_aging_genes,
    genes_of_interest=response_subset_hs_ids,
)
df_anonymous.to_excel(f"{folder}anonymous_aging_hits_annotated.xlsx", index=False)

# Notebook-style preview of the annotated table (no effect when run as a script).
df_anonymous.head()

# 	GeneNames	gene_id	Class	chr_id	start	stop	strand	type	position	sites	...	log2FC_wt_D0_vs_wt_D7	pvalue_wt_D14_vs_wt_D21	padj_wt_D14_vs_wt_D21	log2FC_wt_D14_vs_wt_D21	pvalue_wt_D7_vs_wt_D14	padj_wt_D7_vs_wt_D14	log2FC_wt_D7_vs_wt_D14	pvalue_wt_D7_vs_wt_D21	padj_wt_D7_vs_wt_D21	log2FC_wt_D7_vs_wt_D21
# 0	WBGene00002978	sajrG002946	c	I	14622304	14622511	-1	ALT	INTERNAL	ad	...	1.597883	0.644669	1	0.049143	6.353030e-20	2.243961e-16	1.013759	6.329868e-29	3.999084e-25	1.062902
# 1	WBGene00003738	sajrG039325	=	V	12915313	12915555	1	ALT	INTERNAL	ad	...	0.998421	0.269557	1	0.321127	2.600493e-09	1.198073e-06	1.491427	2.261997e-09	4.438150e-07	1.812554
# 2	WBGene00005007	sajrG028932	=	IV	8892046	8892048	1	ALT	INTERNAL	aa	...	0.490385	0.448171	1	0.068356	2.050674e-05	2.678814e-03	0.294061	1.764764e-05	1.376472e-03	0.362417
# 3	WBGene00006759	sajrG030873	=	IV	11994770	11995108	-1	ALT	INTERNAL	ad	...	0.768802	0.769264	1	0.099608	2.799405e-06	5.088198e-04	0.812186	1.119455e-06	1.211044e-04	0.911795
# 4	WBGene00006780	sajrG028599	c	IV	5966653	5966763	1	ALT	FIRST	sd	...	1.029296	0.824274	1	0.027321	4.292683e-11	2.966524e-08	1.025532	2.593437e-15	1.780958e-12	1.052853

# 	ensembl_gene_id	external_gene_name	hsapiens_homolog_ensembl_gene	hsapiens_homolog_associated_gene_name
# 0	WBGene00000001	aap-1	ENSG00000278139	P3R3URF-PIK3R3
# 1	WBGene00000001	aap-1	ENSG00000117461	PIK3R3
# 2	WBGene00000001	aap-1	ENSG00000268173
# 3	WBGene00000001	aap-1	ENSG00000105647	PIK3R2
# 4	WBGene00000002	aat-1	ENSG00000092068	SLC7A8

# 	GeneNames	gene_id	Class	chr_id	start	stop	strand	type	position	sites	...	pvalue_wt_D14_vs_wt_D21	padj_wt_D14_vs_wt_D21	log2FC_wt_D14_vs_wt_D21	pvalue_wt_D7_vs_wt_D14	padj_wt_D7_vs_wt_D14	log2FC_wt_D7_vs_wt_D14	pvalue_wt_D7_vs_wt_D21	padj_wt_D7_vs_wt_D21	log2FC_wt_D7_vs_wt_D21	Human_homologs
# 0	WBGene00002978	sajrG002946	c	I	14622304	14622511	-1	ALT	INTERNAL	ad	...	0.644669	1	0.049143	6.353030e-20	2.243961e-16	1.013759	6.329868e-29	3.999084e-25	1.062902	yes
# 1	WBGene00003738	sajrG039325	=	V	12915313	12915555	1	ALT	INTERNAL	ad	...	0.269557	1	0.321127	2.600493e-09	1.198073e-06	1.491427	2.261997e-09	4.438150e-07	1.812554	no
# 2	WBGene00005007	sajrG028932	=	IV	8892046	8892048	1	ALT	INTERNAL	aa	...	0.448171	1	0.068356	2.050674e-05	2.678814e-03	0.294061	1.764764e-05	1.376472e-03	0.362417	yes
# 3	WBGene00006759	sajrG030873	=	IV	11994770	11995108	-1	ALT	INTERNAL	ad	...	0.769264	1	0.099608	2.799405e-06	5.088198e-04	0.812186	1.119455e-06	1.211044e-04	0.911795	yes
# 4	WBGene00006780	sajrG028599	c	IV	5966653	5966763	1	ALT	FIRST	sd	...	0.824274	1	0.027321	4.292683e-11	2.966524e-08	1.025532	2.593437e-15	1.780958e-12	1.052853	yes