{ "cells": [ { "cell_type": "code", "execution_count": 10, "id": "e2d83e35-e69f-456e-a2fb-77a05b42f43a", "metadata": { "tags": [] }, "outputs": [], "source": [ "import io\n", "import os\n", "import pandas as pd\n", "import gzip\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 16, "id": "900349f6-d73f-4584-b949-42c1f770d696", "metadata": { "tags": [] }, "outputs": [], "source": [ "#create names of all chromosome\n", "chr_list = [str(i+1) for i in range(22)]\n", "chr_list.extend(['X','Y'])" ] }, { "cell_type": "code", "execution_count": 6, "id": "0bca1990-db14-4f99-a57a-1132515fa21f", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2024-11-04 13:33:40-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr15.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 16904431253 (16G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chr15.vcf.bgz.1’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 15.74G 78.2MB/s in 4m 34s \n", "\n", "2024-11-04 13:38:15 (58.9 MB/s) - ‘gnomad.genomes.v4.1.sites.chr15.vcf.bgz.1’ saved [16904431253/16904431253]\n", "\n", "--2024-11-04 13:38:15-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr16.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.217.123, 142.251.215.251, 172.217.14.251, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.217.123|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 18918703003 (18G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chr16.vcf.bgz’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 17.62G 77.1MB/s in 4m 7s \n", "\n", "2024-11-04 13:42:23 (73.1 MB/s) - ‘gnomad.genomes.v4.1.sites.chr16.vcf.bgz’ saved [18918703003/18918703003]\n", "\n", "--2024-11-04 13:42:23-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr17.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.215.251, 172.217.14.251, 142.250.69.219, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.215.251|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 17412317144 (16G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chr17.vcf.bgz’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 16.22G 67.9MB/s in 4m 1s \n", "\n", "2024-11-04 13:46:24 (68.9 MB/s) - ‘gnomad.genomes.v4.1.sites.chr17.vcf.bgz’ saved [17412317144/17412317144]\n", "\n", "--2024-11-04 13:46:25-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr18.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.33.123, 142.251.211.251, 142.251.33.91, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.33.123|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 14430177524 (13G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chr18.vcf.bgz’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 13.44G 72.8MB/s in 3m 19s \n", "\n", "2024-11-04 13:49:45 (69.0 MB/s) - ‘gnomad.genomes.v4.1.sites.chr18.vcf.bgz’ saved [14430177524/14430177524]\n", "\n", "--2024-11-04 13:49:45-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr19.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.69.219, 142.251.33.123, 142.251.211.251, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.69.219|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 13679473477 (13G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chr19.vcf.bgz’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 12.74G 74.2MB/s in 3m 1s \n", "\n", "2024-11-04 13:52:46 (72.0 MB/s) - ‘gnomad.genomes.v4.1.sites.chr19.vcf.bgz’ saved [13679473477/13679473477]\n", "\n", "--2024-11-04 13:52:47-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr20.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 11838569571 (11G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chr20.vcf.bgz’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 11.03G 79.9MB/s in 2m 38s \n", "\n", "2024-11-04 13:55:26 (71.2 MB/s) - ‘gnomad.genomes.v4.1.sites.chr20.vcf.bgz’ saved [11838569571/11838569571]\n", "\n", "--2024-11-04 13:55:26-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr21.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.211.251, 142.251.33.91, 142.250.217.91, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.211.251|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 7758981978 (7.2G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chr21.vcf.bgz.1’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 7.23G 72.4MB/s in 1m 50s \n", "\n", "2024-11-04 13:57:16 (67.2 MB/s) - ‘gnomad.genomes.v4.1.sites.chr21.vcf.bgz.1’ saved [7758981978/7758981978]\n", "\n", "--2024-11-04 13:57:16-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr22.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.215.251, 172.217.14.219, 142.250.69.219, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.215.251|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 8731339280 (8.1G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chr22.vcf.bgz’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 8.13G 81.2MB/s in 2m 0s \n", "\n", "2024-11-04 13:59:16 (69.7 MB/s) - ‘gnomad.genomes.v4.1.sites.chr22.vcf.bgz’ saved [8731339280/8731339280]\n", "\n", "--2024-11-04 13:59:17-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chrX.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.33.123, 142.251.211.251, 142.251.33.91, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.33.123|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 22908335319 (21G) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chrX.vcf.bgz’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 21.33G 76.7MB/s in 5m 6s \n", "\n", "2024-11-04 14:04:23 (71.5 MB/s) - ‘gnomad.genomes.v4.1.sites.chrX.vcf.bgz’ saved [22908335319/22908335319]\n", "\n", "--2024-11-04 14:04:23-- https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chrY.vcf.bgz\n", "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.14.219, 142.250.69.219, 142.251.33.123, ...\n", "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.14.219|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 599108494 (571M) [application/octet-stream]\n", "Saving to: ‘gnomad.genomes.v4.1.sites.chrY.vcf.bgz’\n", "\n", "gnomad.genomes.v4.1 100%[===================>] 571.35M 78.3MB/s in 8.9s \n", "\n", "2024-11-04 14:04:32 (64.4 MB/s) - ‘gnomad.genomes.v4.1.sites.chrY.vcf.bgz’ saved [599108494/599108494]\n", "\n" ] } ], "source": [ "#download gnomad vcf v4\n", "for i in chr_list: \n", " !wget https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr{i}.vcf.bgz\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "68ba24a1-d60d-4097-8827-7599a1cc705d", "metadata": { "tags": [] }, "outputs": [], "source": [ "#rewrite files to only extract alleles with frequency above freq_filter\n", "\n", "def rewrite(chr_name, freq_filter):\n", " file_neme = f'gnomad.genomes.v4.1.sites.chr{chr_name}.vcf.bgz'\n", " file_male_name=f'gnomad.genomes.v4.1.sites.chr{chr_name}.male.txt'\n", " file_female_name=f'gnomad.genomes.v4.1.sites.chr{chr_name}.female.txt'\n", " \n", " #delete old male female files if they exist\n", " try:\n", " os.remove(file_male_name)\n", " except OSError:\n", " pass\n", " try:\n", " os.remove(file_female_name)\n", " except OSError:\n", " pass\n", "\n", " with open(file_male_name, 'a') as m_file, open(file_female_name, 'a') as f_file:\n", " m_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XY\\n')\n", " f_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XX\\n')\n", " with gzip.open(file_neme, 'r') as f:\n", " for l in f:\n", " if not str(l)[2:3]=='#':\n", " vcf_pos = str(l)[2:-1]\n", " pos = vcf_pos.split('\\\\t')\n", " info = pos[-1].split(';')\n", " pos = pos[:6]\n", " af_XX = list(np.array(info)[[i[:5]=='AF_XX' for i in info]])\n", " af_XY = list(np.array(info)[[i[:5]=='AF_XY' for i in info]])\n", " #some positions don't have allele frequency\n", " if len(af_XX)==1:\n", " assert af_XX[0][:5]=='AF_XX'\n", " af_XX = af_XX[0].split('=')[1] \n", " if float(af_XX)>=freq_filter:\n", " f_file.write((',').join(pos)+','+af_XX+'\\n')\n", " if len(af_XY)==1:\n", " assert af_XY[0][:5]=='AF_XY'\n", " af_XY = af_XY[0].split('=')[1] \n", " if float(af_XY)>=freq_filter:\n", " m_file.write((',').join(pos)+','+af_XY+'\\n')\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "2fdac0b3-f8f3-4540-b050-408672fee807", "metadata": {}, "outputs": [], "source": [ "freq_filter = 0.005\n", "[rewrite(i, freq_filter) for i in chr_list]" ] }, { "cell_type": "code", "execution_count": 21, "id": "fb349e47-38d4-4fb2-af02-9a522c64b3fc", "metadata": { "tags": [] }, "outputs": [], "source": [ "#concat all chr to one file\n", "def concat_all_chr_to_m_f(chr_list):\n", " file_male_name = f'gnomad.genomes.v4.1.sites.male.txt'\n", " file_female_name = f'gnomad.genomes.v4.1.sites.female.txt'\n", " \n", " #delete old male female files if they exist\n", " try:\n", " os.remove(file_male_name)\n", " except OSError:\n", " pass\n", " try:\n", " os.remove(file_female_name)\n", " except OSError:\n", " pass\n", " \n", " with open(file_male_name, 'a') as m_file, open(file_female_name, 'a') as f_file:\n", " m_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XY\\n')\n", " f_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XX\\n')\n", " for chr_n in chr_list:\n", " file_male=f'gnomad.genomes.v4.1.sites.chr{chr_n}.male.txt'\n", " with open(file_male) as infile:\n", " next(infile)\n", " for line in infile:\n", " m_file.write(line)\n", " if chr_n != 'Y':\n", " file_female=f'gnomad.genomes.v4.1.sites.chr{chr_n}.female.txt'\n", " with open(file_female) as infile:\n", " next(infile)\n", " for line in infile:\n", " f_file.write(line)" ] }, { "cell_type": "code", "execution_count": null, "id": "e170368a-9738-49af-9898-ac85bbe72385", "metadata": { "tags": [] }, "outputs": [], "source": [ "concat_all_chr_to_m_f(chr_list)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }