Skip to content

Commit

Permalink
store the changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Zhuohan1999 committed Sep 29, 2023
1 parent e5e49a9 commit 2f71378
Show file tree
Hide file tree
Showing 5 changed files with 121,802 additions and 22 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3ef4e4c3-bbff-4170-9390-b51feae5efb6",
"metadata": {},
"source": [
"## Import the necessary packages"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "be874cb0-b8ff-4af7-ac51-7fd3c914291a",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import numpy as np\n",
"import MDAnalysis as mda\n",
"import sklearn.cluster as sk\n",
"import matplotlib.pyplot as plt\n",
"import copy\n",
"import subprocess\n",
"import scipy.stats\n",
"import matplotlib.patches as mpatches\n",
"import matplotlib as mpl\n",
"\n",
"mpl.rcParams['pdf.fonttype'] = 42\n",
"\n",
"plt.rcParams['font.family'] = 'Helvetica'\n",
"plt.rcParams['font.size'] = 22"
]
},
{
"cell_type": "markdown",
"id": "b5d8c8a3-795d-423e-bf4b-a55f5f0289b5",
"metadata": {},
"source": [
"We defined the important parameters in the next block."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7b3ae3b2-9dc7-4d74-a860-0b35631f3c28",
"metadata": {},
"outputs": [],
"source": [
"# The next two parameters are used when computing the DamID and TSASeq signals\n",
"sigma_tanh = 4\n",
"rc_tanh = 0.75\n",
"\n",
"N_chr_beads = 60642 #Number of chromosome particles \n",
"\n",
"first_frame = 0 #The analysis starts from the first frame, currently, 500\n",
"N_nucleolus_particles = 300 #The number of nucleolus particles\n",
"N_speckles_particles = 1600 #The number of speckle particles\n",
"N_lamina_particles = 8000 #The number of lamina particles\n",
"radius_nucleus = 13.0 #The radius of cell nucleus, 13.0 (LJ unit) = 5.0 µm\n",
"\n",
"\"\"\"Info files\"\"\"\n",
"gLength = np.loadtxt(\"mol_info/gLengthFile.txt\",dtype=int) #The length of each chromosome\n",
"maternalIdx = np.loadtxt(\"mol_info/maternalIdxFile.txt\",dtype=int) #Index of each maternal chromosome\n",
"paternalIdx = np.loadtxt(\"mol_info/paternalIdxFile.txt\",dtype=int) #Index of each paternal chromosome"
]
},
{
"cell_type": "markdown",
"id": "d5c1c187-a093-40c5-b67f-94ffc8c8806e",
"metadata": {},
"source": [
"## Compute the DamID and TSASeq\n",
"\n",
"The next block defines a function to calculate the DamID and TSASeq, and is similar to OpenNucleome/openNucleome/utility/DamID_TSASeq_calculation.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29222bbb-8773-4571-ac97-5756c1570785",
"metadata": {},
"outputs": [],
"source": [
"def DamID_TSASeq_calculation(traj_data):\n",
" N_frame = len(traj_data)-first_frame\n",
" exp_tsa_seq = np.zeros(N_chr_beads)\n",
" exp_damid = np.zeros(N_chr_beads)\n",
" exp_tsa_seq_all_frames = np.zeros((N_frame,N_chr_beads))\n",
" exp_damid_all_frames = np.zeros((N_frame,N_chr_beads))\n",
" N_speckle = []\n",
" \n",
" for frame_number in range(first_frame,len(traj_data),1):\n",
" chr_I_data = traj_data.trajectory[frame_number].positions[:N_chr_beads]\n",
" speckles_data = traj_data.trajectory[frame_number].positions[(N_chr_beads+N_nucleolus_particles):\n",
" (N_chr_beads+N_nucleolus_particles+N_speckles_particles)]\n",
" lamina_data = traj_data.trajectory[frame_number].positions[(N_chr_beads+N_nucleolus_particles+N_speckles_particles):]\n",
"\n",
" #Following code identifies the speckles clusters in the system and calculates and saves center of mass positions\n",
" \"\"\"Code Snippet from DBSCAN Python Page\"\"\"\n",
" \"\"\"https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py\"\"\"\n",
" db = sk.DBSCAN(eps=1.0).fit(speckles_data)\n",
" core_samples_mask = np.zeros_like(db.labels_, dtype=bool)\n",
" core_samples_mask[db.core_sample_indices_] = True\n",
" labels = db.labels_\n",
" # Number of clusters in labels, ignoring noise if present.\n",
" n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)\n",
" n_noise_ = list(labels).count(-1)\n",
" \n",
" N_speckle.append(n_clusters)\n",
"\n",
" cluster_com_master = np.zeros((n_clusters_,3))\n",
" radius_cluster_master = np.zeros(n_clusters_)\n",
" i = 0\n",
" while(i<=max(labels)):\n",
" #Go cluster by cluster\n",
" r_points_cluster = copy.deepcopy(speckles_data[labels==i])\n",
" cluster_com = np.mean(r_points_cluster,axis=0)\n",
" cluster_com_master[i] = copy.deepcopy(cluster_com)\n",
" radius_cluster_master[i] = (np.sum((r_points_cluster-cluster_com)**2,axis=None)/len(r_points_cluster))**0.5\n",
" i +=1\n",
"\n",
" for i in range(len(chr_I_data)):\n",
" ###Speckles\n",
" distances_from_speckles = np.sum((cluster_com_master-chr_I_data[i])**2,axis=1)**0.5\n",
" distances_from_speckles -= radius_cluster_master\n",
"\n",
" #Do this only for speckles\n",
" exp_tsa_seq_all_frames[frame_number-first_frame,i] = \n",
" np.sum(0.5*(1.0+np.tanh(sigma_tanh*(rc_tanh-distances_from_speckles))),axis=None)\n",
" #exp_tsa_seq will be normalized by total (itself) while getting enrichment and so it does not matter if I divide by number of speckles\n",
" #But have to divide exp_tsa_seq_all_frames by number of speckles so that the values can be interpretted as probabilities\n",
" #when calculating correlations\n",
" exp_tsa_seq_all_frames[frame_number-first_frame,i] /= float(n_clusters_)\n",
" exp_tsa_seq[i] += copy.deepcopy(exp_tsa_seq_all_frames[frame_number-first_frame,i])\n",
"\n",
" ###Lamina\n",
" distances_from_lamina = np.sum((lamina_data-chr_I_data[i])**2,axis=1)**0.5\n",
"\n",
" #Do this only for lamina\n",
" exp_damid_all_frames[frame_number-first_frame,i] = \n",
" np.sum(0.5*(1.0+np.tanh(sigma_tanh*(rc_tanh-distances_from_lamina))),axis=None)\n",
" exp_damid[i] += copy.deepcopy(exp_damid_all_frames[frame_number-first_frame,i])\n",
"\n",
"\n",
" damid_all_frames_haploid = np.zeros((N_frame,int(N_chr_beads/2)))\n",
" tsaseq_all_frames_haploid = np.zeros((N_frame,int(N_chr_beads/2)))\n",
" for i in range(23):\n",
" damid_all_frames_haploid[:,gLength[i]:gLength[i+1]] = \n",
" 0.5*(exp_damid_all_frames[:,maternalIdx[i][0]-1:maternalIdx[i][1]]\n",
" +exp_damid_all_frames[:,paternalIdx[i][0]-1:paternalIdx[i][1]])\n",
" tsaseq_all_frames_haploid[:,gLength[i]:gLength[i+1]] = \n",
" 0.5*(exp_tsa_seq_all_frames[:,maternalIdx[i][0]-1:maternalIdx[i][1]]\n",
" +exp_tsa_seq_all_frames[:,paternalIdx[i][0]-1:paternalIdx[i][1]])\n",
" \n",
" \n",
" return (np.sum(damid_all_frames_haploid,axis=0), np.sum(tsaseq_all_frames_haploid,axis=0), N_speckle) "
]
},
{
"cell_type": "markdown",
"id": "f1056cbd",
"metadata": {},
"source": [
"In the next step, we run a small sample (10 frames from the simulation) with the function above and plotted the simulated DamID and TSASeq against experimental results."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd8eec86",
"metadata": {},
"outputs": [],
"source": [
"damid_simulated, tsaseq_simulated, N_spec_clusters = DamID_TSASeq_calculation(\"DUMP_FILE.dcd\")\n",
"\n",
"damid_data_low_res = np.loadtxt(\"mol_info/DAM-ID_HFF_100KB.txt\",usecols=[1])\n",
"tsa_data_low_res = np.loadtxt(\"mol_info/TSA-Seq_SON_HFF_100KB.txt\",usecols=[1])\n",
"\n",
"\n",
"chr_choose = 7\n",
"dpi_ = 3000\n",
"bin_width_ = 0.8\n",
"yaxis_labels = [\"DamID\\n(Model)\",\"DamID\\n(Expt)\",\"TSA-Seq\\n(Model)\",\"TSA-Seq\\n(Expt)\"]\n",
"subplot_dict = dict({\"L\":0,\"S\":2})\n",
"\n",
"x_axis = np.linspace(0,gLength[chr_choose]-gLength[chr_choose-1]-1,\n",
" gLength[chr_choose]-gLength[chr_choose-1],dtype=int)\n",
"fig1, axs1 = plt.subplots(4,figsize=(9,6.3),sharex=True,gridspec_kw={'height_ratios':[1,1,1,1]},dpi=dpi_)\n",
"\n",
"plt.subplots_adjust(hspace=0)\n",
"axs1[0].set_title('Chromosome 7')\n",
"\n",
"damid_data_low_res_chr = copy.deepcopy(damid_data_low_res[paternalIdx[chr_disp-1][0]-1\n",
" :paternalIdx[chr_disp-1][1]])\n",
"tsa_data_low_res_chr = copy.deepcopy(tsa_data_low_res[paternalIdx[chr_disp-1][0]-1\n",
" :paternalIdx[chr_disp-1][1]])\n",
"damid_data_low_res_chr_positive = damid_data_low_res_chr>=0.0\n",
"tsa_data_low_res_positive = tsa_data_low_res_chr>=0.0\n",
"\n",
"for nuclear_body in [\"L\",\"S\"]:\n",
"\n",
" if nuclear_body == 'L':\n",
" laf_chr = damid_simulated[gLength[chr_choose-1]:gLength[chr_choose]]\n",
" else:\n",
" laf_chr = tsaseq_simulated[gLength[chr_choose-1]:gLength[chr_choose]]\n",
"\n",
" laf_chr_OE = (laf_chr/(np.mean(laf_chr[:])))\n",
"\n",
" log2_laf_chr = np.zeros(len(laf_chr_OE))\n",
" non_zero_indices = (laf_chr_OE!=0.0)\n",
" log2_laf_chr[non_zero_indices] = np.log2(laf_chr_OE[non_zero_indices])\n",
"\n",
" enriched_indices = log2_laf_chr>=0.0\n",
" axs1[subplot_dict[nuclear_body]].bar(x_axis[enriched_indices],log2_laf_chr[enriched_indices],\n",
" width=bin_width_,color=\"orange\",alpha=0.8)\n",
" axs1[subplot_dict[nuclear_body]].bar(x_axis[~enriched_indices],log2_laf_chr[~enriched_indices],\n",
" width=bin_width_,color=\"blue\",alpha=0.8)\n",
" axs1[subplot_dict[nuclear_body]].set_ylim([-2.0,2.0])\n",
" axs1[subplot_dict[nuclear_body]].set_xlim([0,x_axis[-1]])\n",
" axs1[subplot_dict[nuclear_body]].spines['top'].set_linewidth('2.0')\n",
" axs1[subplot_dict[nuclear_body]].spines['right'].set_linewidth('2.0')\n",
" axs1[subplot_dict[nuclear_body]].spines['left'].set_linewidth('2.0')\n",
" axs1[subplot_dict[nuclear_body]].spines['bottom'].set_linewidth('2.0')\n",
" \n",
"axs1[1].bar(x_axis[damid_data_low_res_chr_positive ],\n",
" damid_data_low_res_chr[damid_data_low_res_chr_positive],color='orange',width=bin_width_,alpha=0.8)\n",
"axs1[1].bar(x_axis[~damid_data_low_res_chr_positive ],\n",
" damid_data_low_res_chr[~damid_data_low_res_chr_positive],color='blue',width=bin_width_,alpha=0.8)\n",
"axs1[1].set_ylim([-2.0,2.0])\n",
"axs1[3].bar(x_axis[tsa_data_low_res_positive],\n",
" tsa_data_low_res_chr[tsa_data_low_res_positive],color='orange',width=bin_width_, alpha=0.8)\n",
"axs1[3].bar(x_axis[~tsa_data_low_res_positive],\n",
" tsa_data_low_res_chr[~tsa_data_low_res_positive],color='blue',width=bin_width_, alpha=0.8)\n",
"axs1[3].set_ylim([-2.0,2.0])\n",
"axs1[3].spines['top'].set_linewidth('2.0')\n",
"axs1[3].spines['right'].set_linewidth('2.0')\n",
"axs1[3].spines['left'].set_linewidth('2.0')\n",
"axs1[3].spines['bottom'].set_linewidth('2.0')\n",
"axs1[1].spines['top'].set_linewidth('2.0')\n",
"axs1[1].spines['right'].set_linewidth('2.0')\n",
"axs1[1].spines['left'].set_linewidth('2.0')\n",
"axs1[1].spines['bottom'].set_linewidth('2.0')\n",
"\n",
"for i in range(4):\n",
" axs1[i].set_ylabel(yaxis_labels[i],fontsize=22)\n",
"axs1[3].set_xlabel(\"Genomic position (100KB)\",fontsize=24)\n",
"\n",
"location_array = axs1[0].get_position().get_points()\n",
"x_legend,y_legend = location_array[0]+0.5*(location_array[1]-location_array[0])\n",
"\n",
"y_legend += 1.5 \n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "15f5efce-885f-4716-80fa-7e66a8333457",
"metadata": {},
"source": [
"## Compute the chromosome contact probabilities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7dd7e0df-e4aa-491f-842a-65f44879c955",
"metadata": {},
"outputs": [],
"source": [
"subprocess.call([\"gfortran -o chromsome_contact_calculation ../../openNucleome/utility/chromsome_contact_calculation.f90\"],shell=True,stdout=subprocess.PIPE)\n",
"subprocess.call([\"./chromsome_contact_calculation DUMP_FILE.dcd 1 -1 ./contact_prob/ 1 counter.txt\"],shell=True,stdout=subprocess.PIPE)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 2f71378

Please sign in to comment.