Files
digital-patients/Download_write_healthy_m_f_txt_file.ipynb
Olamide Isreal 9e6a16c19b Initial commit: digital-patients pipeline (clean, no large files)
Large reference/model files excluded from repo - to be staged to S3 or baked into Docker images.
2026-03-26 15:15:23 +01:00

303 lines
14 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "e2d83e35-e69f-456e-a2fb-77a05b42f43a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import io\n",
"import os\n",
"import pandas as pd\n",
"import gzip\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "900349f6-d73f-4584-b949-42c1f770d696",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Chromosome labels used to build the gnomAD file names: 1-22, then X and Y\n",
"chr_list = [str(number) for number in range(1, 23)] + ['X', 'Y']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bca1990-db14-4f99-a57a-1132515fa21f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Download the gnomAD v4.1 genome sites VCF of every chromosome in chr_list.\n",
"# -c (--continue) resumes a partially downloaded file and leaves a complete one alone,\n",
"# so re-running this cell no longer saves a duplicate '<name>.vcf.bgz.1' while the\n",
"# later cells keep reading an older (possibly truncated) '<name>.vcf.bgz'.\n",
"for i in chr_list:\n",
"    !wget -c https://storage.googleapis.com/gcp-public-data--gnomad/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr{i}.vcf.bgz\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "68ba24a1-d60d-4097-8827-7599a1cc705d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Rewrite each gnomAD sites VCF into two comma-separated text files (male / female)\n",
"# that keep only the alleles whose sex-specific frequency is at least freq_filter.\n",
"\n",
"def _info_value(info_fields, key):\n",
"    \"\"\"Return the value of the 'key=value' entry in info_fields, or None if key is absent.\"\"\"\n",
"    prefix = key + '='\n",
"    for field in info_fields:\n",
"        if field.startswith(prefix):\n",
"            return field[len(prefix):]\n",
"    return None\n",
"\n",
"\n",
"def _meets_threshold(value, freq_filter):\n",
"    \"\"\"Return True when value parses as a float that is >= freq_filter.\"\"\"\n",
"    try:\n",
"        return float(value) >= freq_filter\n",
"    except (TypeError, ValueError):\n",
"        # None (entry absent) or a non-numeric placeholder such as '.'\n",
"        return False\n",
"\n",
"\n",
"def rewrite(chr_name, freq_filter):\n",
"    \"\"\"Filter one chromosome's sites VCF into per-sex allele-frequency text files.\n",
"\n",
"    Reads 'gnomad.genomes.v4.1.sites.chr{chr_name}.vcf.bgz' from the working directory\n",
"    and overwrites the matching '.male.txt' and '.female.txt' files. Each output starts\n",
"    with a header line, followed by one 'CHROM,POS,ID,REF,ALT,QUAL,AF' row per record\n",
"    whose AF_XY (male file) or AF_XX (female file) INFO value is >= freq_filter.\n",
"    A record without that INFO entry is skipped for the corresponding file.\n",
"\n",
"    Args:\n",
"        chr_name: chromosome label inserted into the file names (e.g. '1', 'X').\n",
"        freq_filter: minimum allele frequency (inclusive) for a record to be kept.\n",
"    \"\"\"\n",
"    vcf_path = f'gnomad.genomes.v4.1.sites.chr{chr_name}.vcf.bgz'\n",
"    file_male_name = f'gnomad.genomes.v4.1.sites.chr{chr_name}.male.txt'\n",
"    file_female_name = f'gnomad.genomes.v4.1.sites.chr{chr_name}.female.txt'\n",
"    info_column = 7  # fixed VCF columns: CHROM POS ID REF ALT QUAL FILTER INFO\n",
"\n",
"    # Mode 'w' truncates any previous output, so no separate delete step is needed.\n",
"    with open(file_male_name, 'w') as m_file, open(file_female_name, 'w') as f_file:\n",
"        m_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XY\\n')\n",
"        f_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XX\\n')\n",
"        # Text mode yields real str lines, so tabs and the trailing newline are actual\n",
"        # characters. Parsing the repr of a bytes line left a literal backslash-n glued\n",
"        # to the last INFO entry, which broke float() when AF_XX/AF_XY came last.\n",
"        with gzip.open(vcf_path, 'rt', encoding='utf-8') as vcf:\n",
"            for line in vcf:\n",
"                if line.startswith('#'):\n",
"                    continue\n",
"                columns = line.rstrip('\\r\\n').split('\\t')\n",
"                if len(columns) <= info_column:\n",
"                    # blank or truncated line: there is no INFO column to read\n",
"                    continue\n",
"                info = columns[info_column].split(';')\n",
"                row_prefix = ','.join(columns[:6])\n",
"                af_XX = _info_value(info, 'AF_XX')\n",
"                af_XY = _info_value(info, 'AF_XY')\n",
"                # some positions don't have an allele frequency\n",
"                if _meets_threshold(af_XX, freq_filter):\n",
"                    f_file.write(row_prefix + ',' + af_XX + '\\n')\n",
"                if _meets_threshold(af_XY, freq_filter):\n",
"                    m_file.write(row_prefix + ',' + af_XY + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fdac0b3-f8f3-4540-b050-408672fee807",
"metadata": {},
"outputs": [],
"source": [
"# Minimum sex-specific allele frequency (inclusive) a variant needs to be kept\n",
"freq_filter = 0.005\n",
"\n",
"# Plain loop instead of a list comprehension: rewrite() is called only for its side\n",
"# effects, so a comprehension would just build and display a list of None values.\n",
"for chr_name in chr_list:\n",
"    rewrite(chr_name, freq_filter)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "fb349e47-38d4-4fb2-af02-9a522c64b3fc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Concatenate the per-chromosome files into one genome-wide file per sex.\n",
"\n",
"def _append_without_header(source_path, destination):\n",
"    \"\"\"Copy every line of source_path except its first (header) line into destination.\"\"\"\n",
"    with open(source_path) as source:\n",
"        # The default keeps an empty input file from raising StopIteration.\n",
"        next(source, None)\n",
"        destination.writelines(source)\n",
"\n",
"\n",
"def concat_all_chr_to_m_f(chr_list):\n",
"    \"\"\"Merge the per-chromosome male/female text files into two combined files.\n",
"\n",
"    Overwrites 'gnomad.genomes.v4.1.sites.male.txt' and\n",
"    'gnomad.genomes.v4.1.sites.female.txt' in the working directory. Each starts with a\n",
"    single header line, followed by the data lines (header skipped) of every\n",
"    'gnomad.genomes.v4.1.sites.chr{name}.male.txt' / '.female.txt' file, in the order\n",
"    given by chr_list. The 'Y' entry is copied into the male file only.\n",
"\n",
"    Args:\n",
"        chr_list: chromosome labels (e.g. '1', 'X') whose per-chromosome files are merged.\n",
"\n",
"    Raises:\n",
"        FileNotFoundError: if an expected per-chromosome file does not exist.\n",
"    \"\"\"\n",
"    file_male_name = 'gnomad.genomes.v4.1.sites.male.txt'\n",
"    file_female_name = 'gnomad.genomes.v4.1.sites.female.txt'\n",
"\n",
"    # Mode 'w' truncates any previous output, so no separate delete step is needed.\n",
"    with open(file_male_name, 'w') as m_file, open(file_female_name, 'w') as f_file:\n",
"        m_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XY\\n')\n",
"        f_file.write('#CHROM,POS,ID,REF,ALT,QUAL,AF_XX\\n')\n",
"        for chr_n in chr_list:\n",
"            _append_without_header(f'gnomad.genomes.v4.1.sites.chr{chr_n}.male.txt', m_file)\n",
"            # chrY is left out of the combined female file\n",
"            if chr_n != 'Y':\n",
"                _append_without_header(f'gnomad.genomes.v4.1.sites.chr{chr_n}.female.txt', f_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e170368a-9738-49af-9898-ac85bbe72385",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Merge the per-chromosome outputs into the two genome-wide male / female files\n",
"concat_all_chr_to_m_f(chr_list=chr_list)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}