In [25]:
!pip3 install pyarrow
Defaulting to user installation because normal site-packages is not writeable
Collecting pyarrow
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Requirement already satisfied: numpy>=1.16.6 in /nexus/posix0/MAGE-flaski/service/posit/home/wangy/.jupyter/python/3.10/lib/python3.10/site-packages (from pyarrow) (1.24.3)
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.9/39.9 MB 48.7 MB/s eta 0:00:00:00:0100:01
Installing collected packages: pyarrow
Successfully installed pyarrow-17.0.0

[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: pip install --upgrade pip
In [2]:
import wget
import pandas as pd
import numpy as np
import gzip
import shutil
import os
from pyarrow.parquet import ParquetFile
import pyarrow as pa
In [16]:
# Disabled (commented-out) one-off download of the GTEx v8 junction table.
# When enabled it fetches the .gct.gz with wget into output_directory and then
# gunzips it to the same path with the '.gz' suffix removed.
# url = 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions.gct.gz'

# output_directory = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics"
# filename = wget.download(url, out=output_directory)

# filename
# filenameunzip = filename.replace('.gz', '')

# with gzip.open(filename, 'rb') as f_in:
#     with open(filenameunzip, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)
In [5]:
# Project data directory and the decompressed GTEx v8 junction table inside it.
output_directory = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics/"
filenameunzip = os.path.join(
    output_directory,
    "GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions.gct",
)

# Preview only: skip the two lines that precede the header row and parse
# just the first 10 data rows of the tab-separated file.
df = pd.read_csv(filenameunzip, sep="\t", skiprows=2, nrows=10)
In [6]:
# Display the 10-row preview of the junction table loaded above.
df
Out[6]:
Name Description GTEX-1117F-0226-SM-5GZZ7 GTEX-1117F-0426-SM-5EGHI GTEX-1117F-0526-SM-5EGHJ GTEX-1117F-0626-SM-5N9CS GTEX-1117F-0726-SM-5GIEN GTEX-1117F-1326-SM-5EGHH GTEX-1117F-2426-SM-5EGGH GTEX-1117F-2526-SM-5GZY6 ... GTEX-ZZPU-1126-SM-5N9CW GTEX-ZZPU-1226-SM-5N9CK GTEX-ZZPU-1326-SM-5GZWS GTEX-ZZPU-1426-SM-5GZZ6 GTEX-ZZPU-1826-SM-5E43L GTEX-ZZPU-2126-SM-5EGIU GTEX-ZZPU-2226-SM-5EGIV GTEX-ZZPU-2426-SM-5E44I GTEX-ZZPU-2626-SM-5E45Y GTEX-ZZPU-2726-SM-5NQ8O
0 chr1_12058_12178 ENSG00000223972.5 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 chr1_12228_12612 ENSG00000223972.5 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 chr1_12698_12974 ENSG00000223972.5 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 chr1_12722_13220 ENSG00000223972.5 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
4 chr1_13053_13220 ENSG00000223972.5 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 chr1_13375_13452 ENSG00000223972.5 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 chr1_14502_15004 ENSG00000227232.5 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 chr1_15039_15795 ENSG00000227232.5 10 8 8 9 3 8 7 23 ... 1 2 1 0 2 1 0 1 1 0
8 chr1_15948_16606 ENSG00000227232.5 20 7 14 17 8 18 14 34 ... 0 0 5 1 2 3 0 0 2 2
9 chr1_16766_16857 ENSG00000227232.5 2 0 2 2 7 0 0 1 ... 10 3 4 1 3 0 7 1 1 1

10 rows × 17384 columns

In [23]:
# Disabled (commented-out) one-off download of the GTEx v8 exon-reads parquet
# file; when enabled it fetches the file with wget into output_directory.
# url = 'https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads.parquet'

# output_directory = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics"
# filename = wget.download(url, out=output_directory)
In [20]:
# Preview the exon-read table: request record batches of 10 rows from the
# parquet file, take only the first one, and convert just that batch to pandas.
filename = output_directory + "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads.parquet"
pf = ParquetFile(filename)
batches = pf.iter_batches(batch_size=10)
first_ten_rows = next(batches)
df2 = pa.Table.from_batches([first_ten_rows]).to_pandas()
In [21]:
# Display the 10-row preview of the exon-read table loaded above.
df2
Out[21]:
Description GTEX-1117F-0226-SM-5GZZ7 GTEX-1117F-0426-SM-5EGHI GTEX-1117F-0526-SM-5EGHJ GTEX-1117F-0626-SM-5N9CS GTEX-1117F-0726-SM-5GIEN GTEX-1117F-1326-SM-5EGHH GTEX-1117F-2426-SM-5EGGH GTEX-1117F-2526-SM-5GZY6 GTEX-1117F-2826-SM-5GZXL ... GTEX-ZZPU-1126-SM-5N9CW GTEX-ZZPU-1226-SM-5N9CK GTEX-ZZPU-1326-SM-5GZWS GTEX-ZZPU-1426-SM-5GZZ6 GTEX-ZZPU-1826-SM-5E43L GTEX-ZZPU-2126-SM-5EGIU GTEX-ZZPU-2226-SM-5EGIV GTEX-ZZPU-2426-SM-5E44I GTEX-ZZPU-2626-SM-5E45Y GTEX-ZZPU-2726-SM-5NQ8O
Name
ENSG00000223972.5_1 DDX11L1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
ENSG00000223972.5_2 DDX11L1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.855263
ENSG00000223972.5_3 DDX11L1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
ENSG00000223972.5_4 DDX11L1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.144737
ENSG00000227232.5_1 WASH7P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
ENSG00000227232.5_2 WASH7P 1.000000 18.460526 8.394737 23.868422 0.000000 9.078947 5.894737 5.605263 13.447369 ... 7.381579 12.276316 9.644737 6.368421 8.828947 11.592105 7.171053 19.131578 5.960526 6.671053
ENSG00000227232.5_3 WASH7P 1.197368 8.460526 6.960526 11.684211 1.539474 4.907895 1.868421 32.184212 18.539474 ... 5.144737 4.697368 3.118421 4.513158 5.868421 5.947368 4.368421 7.763158 2.789474 5.171053
ENSG00000227232.5_4 WASH7P 13.710526 8.828947 9.960526 25.052631 4.815789 6.184211 14.947368 40.921055 21.565788 ... 5.421053 6.026316 12.144737 7.697368 7.526316 8.921053 4.842105 10.184211 3.486842 8.236842
ENSG00000227232.5_5 WASH7P 23.328947 4.526316 15.078947 33.473682 17.052631 12.500000 34.407894 48.473682 34.250000 ... 5.013158 19.934212 29.223684 11.328947 11.815789 10.894737 4.065789 13.500000 5.355263 7.078947
ENSG00000227232.5_6 WASH7P 20.763159 5.723684 17.144737 23.986841 13.105263 16.026316 22.578947 54.618420 33.039474 ... 12.881579 13.855263 21.578947 15.657895 14.118421 12.552632 5.907895 11.092105 2.460526 11.960526

10 rows × 17383 columns

In [29]:
# Disabled (commented-out) exploration: compare df2's column labels, with
# everything from '-SM' onward stripped, against the 'Tissue Sample ID' column
# of "GTEx Portal.csv", and list the labels that do not match.
# histo=pd.read_csv(output_directory + "GTEx Portal.csv")

# histo

# df2_columns=df2.columns.str.replace(r'-SM.*', '', regex=True)

# sum(df2_columns.isin(histo['Tissue Sample ID']))

# df2_columns

# not_matching = df2_columns[~df2_columns.isin(histo['Tissue Sample ID'])]
# not_matching
In [30]:
# Sample-level annotation table (tab-separated); its SAMPID column is what the
# following cells match against the expression-table column labels.
sample_attributes_path = output_directory + "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
meta = pd.read_csv(sample_attributes_path, sep='\t')
meta
Out[30]:
SAMPID SMATSSCR SMCENTER SMPTHNTS SMRIN SMTS SMTSD SMUBRID SMTSISCH SMTSPAX ... SME1ANTI SMSPLTRD SMBSMMRT SME1SNSE SME1PCTS SMRRNART SME1MPRT SMNUM5CD SMDPMPRT SME2PCTS
0 GTEX-1117F-0003-SM-58Q7G NaN B1 NaN NaN Blood Whole Blood 0013756 1188.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 GTEX-1117F-0003-SM-5DWSB NaN B1 NaN NaN Blood Whole Blood 0013756 1188.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 GTEX-1117F-0003-SM-6WBT7 NaN B1 NaN NaN Blood Whole Blood 0013756 1188.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 GTEX-1117F-0011-R10a-SM-AHZ7F NaN B1, A1 NaN NaN Brain Brain - Frontal Cortex (BA9) 0009834 1193.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 GTEX-1117F-0011-R10b-SM-CYKQ8 NaN B1, A1 NaN 7.2 Brain Brain - Frontal Cortex (BA9) 0009834 1193.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
22946 K-562-SM-E9EZC NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... 26289400.0 27814300.0 0.002441 26121600.0 49.8400 0.006370 0.995167 NaN 0.0 50.2621
22947 K-562-SM-E9EZI NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... 26653800.0 28341700.0 0.002336 26553400.0 49.9056 0.006806 0.994802 NaN 0.0 50.2046
22948 K-562-SM-E9EZO NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... 14317500.0 15168000.0 0.001731 14163500.0 49.7298 0.006662 0.994935 NaN 0.0 50.2412
22949 K-562-SM-E9EZT NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... 25459900.0 26906500.0 0.002130 25259100.0 49.8020 0.007145 0.994828 NaN 0.0 50.2529
22950 K-562-SM-E9EZZ NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... 22341200.0 23740600.0 0.001867 22232600.0 49.8781 0.006861 0.993576 NaN 0.0 50.2929

22951 rows × 63 columns

In [32]:
# Number of df2 column labels that occur as a SAMPID in the sample annotations.
df2.columns.isin(meta['SAMPID']).sum()
Out[32]:
17382
In [33]:
# Subject-level phenotype table (tab-separated): SUBJID, SEX, AGE, DTHHRDY.
subject_phenotypes_path = output_directory + "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
age = pd.read_csv(subject_phenotypes_path, sep='\t')
age
Out[33]:
SUBJID SEX AGE DTHHRDY
0 GTEX-1117F 2 60-69 4.0
1 GTEX-111CU 1 50-59 0.0
2 GTEX-111FC 1 60-69 1.0
3 GTEX-111VG 1 60-69 3.0
4 GTEX-111YS 1 60-69 0.0
... ... ... ... ...
975 GTEX-ZYY3 2 60-69 4.0
976 GTEX-ZZ64 1 20-29 0.0
977 GTEX-ZZPT 1 50-59 4.0
978 GTEX-ZZPU 2 50-59 0.0
979 K-562 2 50-59 NaN

980 rows × 4 columns

In [49]:
# Derive the subject ID (e.g. "GTEX-1117F", "K-562") from each sample ID by
# keeping its first two '-'-separated fields.
# SAMPID is split once and the pieces reused (instead of re-splitting for each
# column), and the two fields are joined with vectorised string concatenation
# rather than a row-wise Python join over the whole frame.
sampid_fields = meta['SAMPID'].str.split('-', n=2, expand=True)
meta['g1'] = sampid_fields[0]
meta['g2'] = sampid_fields[1]
meta['SAMP_Group'] = meta['g1'] + '-' + meta['g2']
In [51]:
# Attach the subject-level phenotype columns from `age` to every sample row,
# matching each sample's SAMP_Group to the subject's SUBJID.
#
# This cell overwrites `meta` with the result of merging `meta` itself, so a
# plain merge is not re-runnable: a second execution would merge `age` into an
# already-merged frame and produce suffixed duplicates (SUBJID_x, SUBJID_y, ...).
# Dropping any phenotype columns left over from a previous run first makes the
# cell idempotent; on a first run nothing is dropped and the result is unchanged.
#
# validate="many_to_one" makes pandas raise if SUBJID is not unique in `age`,
# which would otherwise silently duplicate sample rows in the left join.
meta = pd.merge(
    meta.drop(columns=age.columns, errors="ignore"),
    age,
    left_on="SAMP_Group",
    right_on="SUBJID",
    how="left",
    validate="many_to_one",
)
meta
Out[51]:
SAMPID SMATSSCR SMCENTER SMPTHNTS SMRIN SMTS SMTSD SMUBRID SMTSISCH SMTSPAX ... SMNUM5CD SMDPMPRT SME2PCTS SAMP_Group g1 g2 SUBJID SEX AGE DTHHRDY
0 GTEX-1117F-0003-SM-58Q7G NaN B1 NaN NaN Blood Whole Blood 0013756 1188.0 NaN ... NaN NaN NaN GTEX-1117F GTEX 1117F GTEX-1117F 2 60-69 4.0
1 GTEX-1117F-0003-SM-5DWSB NaN B1 NaN NaN Blood Whole Blood 0013756 1188.0 NaN ... NaN NaN NaN GTEX-1117F GTEX 1117F GTEX-1117F 2 60-69 4.0
2 GTEX-1117F-0003-SM-6WBT7 NaN B1 NaN NaN Blood Whole Blood 0013756 1188.0 NaN ... NaN NaN NaN GTEX-1117F GTEX 1117F GTEX-1117F 2 60-69 4.0
3 GTEX-1117F-0011-R10a-SM-AHZ7F NaN B1, A1 NaN NaN Brain Brain - Frontal Cortex (BA9) 0009834 1193.0 NaN ... NaN NaN NaN GTEX-1117F GTEX 1117F GTEX-1117F 2 60-69 4.0
4 GTEX-1117F-0011-R10b-SM-CYKQ8 NaN B1, A1 NaN 7.2 Brain Brain - Frontal Cortex (BA9) 0009834 1193.0 NaN ... NaN NaN NaN GTEX-1117F GTEX 1117F GTEX-1117F 2 60-69 4.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
22946 K-562-SM-E9EZC NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... NaN 0.0 50.2621 K-562 K 562 K-562 2 50-59 NaN
22947 K-562-SM-E9EZI NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... NaN 0.0 50.2046 K-562 K 562 K-562 2 50-59 NaN
22948 K-562-SM-E9EZO NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... NaN 0.0 50.2412 K-562 K 562 K-562 2 50-59 NaN
22949 K-562-SM-E9EZT NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... NaN 0.0 50.2529 K-562 K 562 K-562 2 50-59 NaN
22950 K-562-SM-E9EZZ NaN NaN NaN NaN Bone Marrow Cells - Leukemia cell line (CML) EFO_0002067 NaN NaN ... NaN 0.0 50.2929 K-562 K 562 K-562 2 50-59 NaN

22951 rows × 70 columns

In [ ]: