Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
303 lines
14 KiB
Plaintext
303 lines
14 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "e2d83e35-e69f-456e-a2fb-77a05b42f43a",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import io\n",
|
||
"import os\n",
|
||
"import pandas as pd\n",
|
||
"import gzip\n",
|
||
"import numpy as np"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "900349f6-d73f-4584-b949-42c1f770d696",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
"# Chromosome labels without the 'chr' prefix: autosomes 1-22 followed by X and Y.\n",
"chr_list = [*map(str, range(1, 23)), 'X', 'Y']"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "0bca1990-db14-4f99-a57a-1132515fa21f",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"--2024-11-04 13:33:40-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr15.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 16904431253 (16G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chr15.vcf.bgz.1’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 15.74G 78.2MB/s in 4m 34s \n",
|
||
"\n",
|
||
"2024-11-04 13:38:15 (58.9 MB/s) - ‘gnomad.genomes.v4.1.sites.chr15.vcf.bgz.1’ saved [16904431253/16904431253]\n",
|
||
"\n",
|
||
"--2024-11-04 13:38:15-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr16.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.217.123, 142.251.215.251, 172.217.14.251, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.217.123|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 18918703003 (18G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chr16.vcf.bgz’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 17.62G 77.1MB/s in 4m 7s \n",
|
||
"\n",
|
||
"2024-11-04 13:42:23 (73.1 MB/s) - ‘gnomad.genomes.v4.1.sites.chr16.vcf.bgz’ saved [18918703003/18918703003]\n",
|
||
"\n",
|
||
"--2024-11-04 13:42:23-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr17.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.215.251, 172.217.14.251, 142.250.69.219, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.215.251|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 17412317144 (16G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chr17.vcf.bgz’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 16.22G 67.9MB/s in 4m 1s \n",
|
||
"\n",
|
||
"2024-11-04 13:46:24 (68.9 MB/s) - ‘gnomad.genomes.v4.1.sites.chr17.vcf.bgz’ saved [17412317144/17412317144]\n",
|
||
"\n",
|
||
"--2024-11-04 13:46:25-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr18.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.33.123, 142.251.211.251, 142.251.33.91, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.33.123|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 14430177524 (13G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chr18.vcf.bgz’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 13.44G 72.8MB/s in 3m 19s \n",
|
||
"\n",
|
||
"2024-11-04 13:49:45 (69.0 MB/s) - ‘gnomad.genomes.v4.1.sites.chr18.vcf.bgz’ saved [14430177524/14430177524]\n",
|
||
"\n",
|
||
"--2024-11-04 13:49:45-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr19.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.69.219, 142.251.33.123, 142.251.211.251, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.69.219|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 13679473477 (13G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chr19.vcf.bgz’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 12.74G 74.2MB/s in 3m 1s \n",
|
||
"\n",
|
||
"2024-11-04 13:52:46 (72.0 MB/s) - ‘gnomad.genomes.v4.1.sites.chr19.vcf.bgz’ saved [13679473477/13679473477]\n",
|
||
"\n",
|
||
"--2024-11-04 13:52:47-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr20.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 11838569571 (11G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chr20.vcf.bgz’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 11.03G 79.9MB/s in 2m 38s \n",
|
||
"\n",
|
||
"2024-11-04 13:55:26 (71.2 MB/s) - ‘gnomad.genomes.v4.1.sites.chr20.vcf.bgz’ saved [11838569571/11838569571]\n",
|
||
"\n",
|
||
"--2024-11-04 13:55:26-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr21.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 7758981978 (7.2G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chr21.vcf.bgz.1’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 7.23G 72.4MB/s in 1m 50s \n",
|
||
"\n",
|
||
"2024-11-04 13:57:16 (67.2 MB/s) - ‘gnomad.genomes.v4.1.sites.chr21.vcf.bgz.1’ saved [7758981978/7758981978]\n",
|
||
"\n",
|
||
"--2024-11-04 13:57:16-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr22.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.215.251, 172.217.14.219, 142.250.69.219, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.215.251|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 8731339280 (8.1G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chr22.vcf.bgz’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 8.13G 81.2MB/s in 2m 0s \n",
|
||
"\n",
|
||
"2024-11-04 13:59:16 (69.7 MB/s) - ‘gnomad.genomes.v4.1.sites.chr22.vcf.bgz’ saved [8731339280/8731339280]\n",
|
||
"\n",
|
||
"--2024-11-04 13:59:17-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chrX.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.33.123, 142.251.211.251, 142.251.33.91, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.33.123|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 22908335319 (21G) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chrX.vcf.bgz’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 21.33G 76.7MB/s in 5m 6s \n",
|
||
"\n",
|
||
"2024-11-04 14:04:23 (71.5 MB/s) - ‘gnomad.genomes.v4.1.sites.chrX.vcf.bgz’ saved [22908335319/22908335319]\n",
|
||
"\n",
|
||
"--2024-11-04 14:04:23-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chrY.vcf.bgz\n",
|
||
"Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.14.219, 142.250.69.219, 142.251.33.123, ...\n",
|
||
"Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.14.219|:443... connected.\n",
|
||
"HTTP request sent, awaiting response... 200 OK\n",
|
||
"Length: 599108494 (571M) [application/octet-stream]\n",
|
||
"Saving to: ‘gnomad.genomes.v4.1.sites.chrY.vcf.bgz’\n",
|
||
"\n",
|
||
"gnomad.genomes.v4.1 100%[===================>] 571.35M 78.3MB/s in 8.9s \n",
|
||
"\n",
|
||
"2024-11-04 14:04:32 (64.4 MB/s) - ‘gnomad.genomes.v4.1.sites.chrY.vcf.bgz’ saved [599108494/599108494]\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
"# Download the gnomAD v4.1 genome sites VCF for every chromosome in chr_list.\n",
"# Plain wget never overwrites: on a re-run it saves to '<name>.1' and leaves the\n",
"# old (possibly partial) file under the name that rewrite() opens. With -c wget\n",
"# resumes a partial file in place and does nothing when the file is complete.\n",
"for chrom in chr_list:\n",
"    !wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr{chrom}.vcf.bgz\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "68ba24a1-d60d-4097-8827-7599a1cc705d",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
"# Filter a per-chromosome gnomAD sites VCF into sex-specific CSV files.\n",
"\n",
"def _info_value(info_entries, key):\n",
"    \"\"\"Return the value of the single ``key=value`` INFO entry, or None.\n",
"\n",
"    None is returned when the key is absent or present more than once, so the\n",
"    caller skips records that carry no usable value for that key.\n",
"    \"\"\"\n",
"    prefix = key + '='\n",
"    matches = [entry for entry in info_entries if entry.startswith(prefix)]\n",
"    if len(matches) != 1:\n",
"        return None\n",
"    return matches[0][len(prefix):]\n",
"\n",
"\n",
"def rewrite(chr_name, freq_filter):\n",
"    \"\"\"Extract records at or above an allele-frequency threshold for one chromosome.\n",
"\n",
"    Reads ``gnomad.genomes.v4.1.sites.chr{chr_name}.vcf.bgz`` from the current\n",
"    directory and writes two comma-separated files beside it, overwriting any\n",
"    previous versions:\n",
"\n",
"    * ``...chr{chr_name}.female.txt``: records whose ``AF_XX`` >= ``freq_filter``\n",
"    * ``...chr{chr_name}.male.txt``: records whose ``AF_XY`` >= ``freq_filter``\n",
"\n",
"    Each output row holds CHROM, POS, ID, REF, ALT, QUAL and the matching\n",
"    frequency exactly as it is written in the VCF.\n",
"\n",
"    Args:\n",
"        chr_name: Chromosome label without the ``chr`` prefix, e.g. ``'1'`` or ``'X'``.\n",
"        freq_filter: Inclusive lower bound on the allele frequency to keep.\n",
"\n",
"    Raises:\n",
"        FileNotFoundError: If the input VCF does not exist.\n",
"        ValueError: If an ``AF_XX``/``AF_XY`` value is not a number.\n",
"    \"\"\"\n",
"    vcf_name = f'gnomad.genomes.v4.1.sites.chr{chr_name}.vcf.bgz'\n",
"    file_male_name = f'gnomad.genomes.v4.1.sites.chr{chr_name}.male.txt'\n",
"    file_female_name = f'gnomad.genomes.v4.1.sites.chr{chr_name}.female.txt'\n",
"\n",
"    # Open the input first so a missing VCF fails before any output is touched.\n",
"    # Text mode gives real str lines, so fields split on an actual tab and the\n",
"    # line terminator can be stripped instead of leaking into the last INFO entry.\n",
"    with gzip.open(vcf_name, 'rt', encoding='utf-8') as vcf:\n",
"        # Mode 'w' truncates earlier results; no delete-then-append is needed.\n",
"        with open(file_male_name, 'w', encoding='utf-8') as m_file, open(file_female_name, 'w', encoding='utf-8') as f_file:\n",
"            m_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XY\\n')\n",
"            f_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XX\\n')\n",
"            for line in vcf:\n",
"                if line.startswith('#'):\n",
"                    continue\n",
"                columns = line.rstrip('\\r\\n').split('\\t')\n",
"                # A data line needs the 8 fixed VCF columns; INFO is the 8th.\n",
"                if len(columns) < 8:\n",
"                    continue\n",
"                record = ','.join(columns[:6])\n",
"                info_entries = columns[7].split(';')\n",
"                for key, out_file in (('AF_XX', f_file), ('AF_XY', m_file)):\n",
"                    value = _info_value(info_entries, key)\n",
"                    # Some positions carry no allele frequency for a sex.\n",
"                    if value is not None and float(value) >= freq_filter:\n",
"                        out_file.write(record + ',' + value + '\\n')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "2fdac0b3-f8f3-4540-b050-408672fee807",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Keep alleles whose sex-specific frequency is at least 0.005 (0.5%).\n",
"freq_filter = 0.005\n",
"# A plain loop: rewrite() is called for its side effects and returns nothing,\n",
"# so a list comprehension would only build and display a list of None.\n",
"for chr_name in chr_list:\n",
"    rewrite(chr_name, freq_filter)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"id": "fb349e47-38d4-4fb2-af02-9a522c64b3fc",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
"# Concatenate the per-chromosome files into one male and one female file.\n",
"\n",
"def _append_data_rows(source_name, destination):\n",
"    \"\"\"Copy every line of ``source_name`` except its first (header) line to ``destination``.\"\"\"\n",
"    with open(source_name) as source:\n",
"        # The default keeps an empty file from raising StopIteration.\n",
"        next(source, None)\n",
"        destination.writelines(source)\n",
"\n",
"\n",
"def concat_all_chr_to_m_f(chr_list):\n",
"    \"\"\"Merge the per-chromosome male/female files into two combined files.\n",
"\n",
"    Writes ``gnomad.genomes.v4.1.sites.male.txt`` and\n",
"    ``gnomad.genomes.v4.1.sites.female.txt`` in the current directory, each\n",
"    starting with one header line followed by the data rows of\n",
"    ``gnomad.genomes.v4.1.sites.chr{chr}.male.txt`` / ``.female.txt`` in the\n",
"    order given by ``chr_list``. Existing combined files are overwritten.\n",
"\n",
"    Args:\n",
"        chr_list: Chromosome labels without the ``chr`` prefix. The female file\n",
"            is not read for ``'Y'``.\n",
"\n",
"    Raises:\n",
"        FileNotFoundError: If a required per-chromosome file is missing.\n",
"    \"\"\"\n",
"    file_male_name = 'gnomad.genomes.v4.1.sites.male.txt'\n",
"    file_female_name = 'gnomad.genomes.v4.1.sites.female.txt'\n",
"\n",
"    # Mode 'w' truncates earlier results; no delete-then-append is needed.\n",
"    with open(file_male_name, 'w') as m_file, open(file_female_name, 'w') as f_file:\n",
"        m_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XY\\n')\n",
"        f_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XX\\n')\n",
"        for chr_n in chr_list:\n",
"            _append_data_rows(f'gnomad.genomes.v4.1.sites.chr{chr_n}.male.txt', m_file)\n",
"            # The Y chromosome is left out of the female (AF_XX) file.\n",
"            if chr_n != 'Y':\n",
"                _append_data_rows(f'gnomad.genomes.v4.1.sites.chr{chr_n}.female.txt', f_file)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "e170368a-9738-49af-9898-ac85bbe72385",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Combine the per-chromosome male/female files for every entry of chr_list.\n",
"concat_all_chr_to_m_f(chr_list)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|