# MACARON

#!/usr/bin/python

## Version 1.0 (2020-05-12)

"""
#Script to identify SNPs located within a genetic codon:

Annotation of SNPs (including synonymous and non-synonymous variants) located within the same genetic codon requires attention. This idea conflicts with the annotations produced by the traditional annotation software widely used nowadays. When the combined effect is considered within the framework of the genetic codon, we obtain new / altered codons that code for a new amino acid, which can be predicted by using this MACARON python script.

#####------How to quickly run MACARON-------

# python MACARON -i yourinputfile.vcf

"""

# NEWS:
# * GATK wrapper only (guidelines from GATK 4). Compatible with latest GATK4 (4.1.7)
# * JAVA_OPTIONS allows passing java arguments to GATK and snpEff
# * snpEff wrapper (available from bioconda) can be used
# * MACARON will recognize snpEff jar/wrapper by identifying the '.jar' extension 
# * Fixed bug with grep: MACARON is now working on MacOS
# * Global refactoring, no longer uses classes

## Library imports

import sys, os, time
import itertools
import multiprocessing
import re
import subprocess
from argparse import ArgumentParser


## GLOBAL VARIABLES (IMPORTANT: You can set the default values here)
## ----------------

# Path or command name of the GATK launcher. When this does not resolve,
# the main section falls back to a `gatk` executable found on PATH.
GATK = ""
# Path or command name of snpEff (wrapper or .jar). When this does not
# resolve, the main section falls back to a `snpEff` executable on PATH.
SNPEFF = ""
# Reference genome FASTA passed to GATK with -R (placeholder path; set it
# here or override it with --HG_REF).
HG_REF = "/path/to/RefGenom/hg19.fa"
# snpEff genome database name (override with --SNPEFF_HG).
SNPEFF_HG = "GRCh37.75" 
# Java arguments forwarded to GATK (--java-options) and to snpEff
# (override with --JAVA_OPTIONS).
JAVA_OPTIONS = "-Xmx4g" 


## PRINTINGS, AESTHETICS
## ---------------------

# One label per pipeline step; print_step()/end_print_step() index into this list.
str_progress_list = [
	"\tIndexing VCF file",
	"\tIdentifying SnpClusters",
	"\tExtracting SnpClusters",
	"\tAnnotating SnpClusters",
	"\tExcluding InDels",
	"\tGenerating a SnpCluster Table",
	"\tRe-annotating Codons",
	"\tRemoving SnpCluster if AA_Change_pcSNV == (AA1 or AA2)",
	"\tExtracting established SnpClusters",
]

# Banner printed once at start-up.
header = (
	"\n"
	"(###############)\n"
	"@@@@ MACARON @@@@\n"
	"(###############)\n\n"
	"Starting....\n"
)

# Closing message printed when the run ends.
footer = "\nMACARON Run Completed. Bon Courage with Analysis ...,,,!!!!\n"


## UTILS
## -----

# Functions for animation
def animate(keep_anim, idx):
	"""
	Draw a one-character spinner next to the label of step `idx` for as long
	as the `keep_anim` event is set, then print the "Done!" line.

	keep_anim: event object polled through is_set().
	idx: index of the step label in str_progress_list.
	"""
	spinner = "|/-\\"
	label = str_progress_list[idx]
	tick = 0
	while keep_anim.is_set():
		tick += 1
		frame = spinner[tick % len(spinner)]
		# The carriage returns keep the spinner on one terminal line.
		sys.stdout.write("\r>" + label + ": " + frame + "\r")
		sys.stdout.flush()
		time.sleep(0.1)
	sys.stdout.write("\r " + label + ": Done!\n")
def print_step(keep_anim, idx):
	"""
	Announce that step `idx` has started.

	With the global ANIM flag on, sets `keep_anim` and starts a separate
	process running animate(); that process object is returned so the caller
	can hand it to end_print_step(). Otherwise a static "in progress..."
	line is printed and None is returned.
	"""
	if not ANIM:
		sys.stdout.write(">" + str_progress_list[idx] + ": in progress...\r" )
		sys.stdout.flush()
		return None
	keep_anim.set()
	spinner_proc = multiprocessing.Process(target=animate, args=(keep_anim, idx))
	spinner_proc.start()
	return spinner_proc
def end_print_step(keep_anim, anim_thread, idx):
	"""
	Announce that step `idx` is finished.

	With the global ANIM flag on, clears `keep_anim` (which ends the loop in
	animate()) and waits for `anim_thread`; animate() prints the "Done!" line
	itself. Otherwise the "Done!" line is printed here.
	"""
	if ANIM:
		keep_anim.clear()
		anim_thread.join()
		return
	# The trailing blanks overwrite the longer "in progress..." text.
	sys.stdout.write(" " + str_progress_list[idx] + ": Done!            \n")
	sys.stdout.flush()

# Find if a program is accessible via PATH
def which(program):
	"""
	Resolve `program` to a usable path.

	If `program` contains a directory part, it is returned unchanged when it
	is an executable file or an existing '.jar' file. Otherwise every
	directory listed in PATH is searched for an executable file with that
	name ('.jar' files are not looked up on PATH).

	Returns the resolved path, or None when nothing suitable is found.
	"""
	def is_exe(candidate):
		return os.path.isfile(candidate) and os.access(candidate, os.X_OK)

	def is_jar(candidate):
		return os.path.isfile(candidate) and candidate.endswith('.jar')

	directory, _ = os.path.split(program)
	if directory:
		if is_exe(program) or is_jar(program):
			return program
		return None

	# PATH may be missing from the environment (e.g. stripped-down cron or
	# container setups); use the platform default instead of raising KeyError.
	search_path = os.environ.get("PATH")
	if search_path is None:
		search_path = os.defpath
	for path in search_path.split(os.pathsep):
		exe_file = os.path.join(path, program)
		if is_exe(exe_file):
			return exe_file
	return None

# If no cluster of SNP found, end analysis with a message. 
def end_macaron(message):
	"""
	Stop the run early: print `message` and the footer, remove the temporary
	directory unless the global KEEP_TMP flag is set, then exit with status 0.
	"""
	print(message)
	print(footer)
	if not KEEP_TMP:
		# Argument list instead of a shell string, so a TMPDIR containing
		# spaces or shell metacharacters is removed correctly.
		subprocess.check_output(["rm", "-r", TMPDIR])
	sys.exit(0)

# Check if vcf contains entries
def check_vcf_empty(file):
	"""
	End the run through end_macaron() when the VCF at path `file` holds no
	variant record.

	A record is any non-blank line that does not start with '#'. Returns
	None as soon as one is found.
	"""
	with open(file, 'r') as vcf:
		# Stream the file and stop at the first record instead of loading
		# every line; blank lines are not counted as records.
		for record in vcf:
			if record.strip() and not record.startswith('#'):
				return
	end_macaron("\nMACARON did not find any cluster of SNP in your VCF.\n")

# Translate a single codon
# Codon -> one-letter amino acid table; stop codons map to '_'.
# Built once at import time instead of on every translate_codon() call.
_GENCODE = {
	'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
	'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
	'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
	'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
	'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
	'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
	'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
	'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}

def translate_codon(codon):
	"""
	Return the one-letter amino acid for `codon` (case-insensitive).

	Stop codons give '_'; anything that is not one of the 64 codons gives '#'.
	"""
	return _GENCODE.get(codon.upper(), '#')

# Split a sequence into codons
def split_into_codons(dna, frame):
	"""
	Cut `dna` into consecutive 3-character pieces, starting at the 1-based
	offset `frame`. Trailing characters that do not fill a whole codon are
	dropped. Returns the list of pieces.
	"""
	first = frame - 1
	last_start = len(dna) - 2
	return [dna[pos:pos + 3] for pos in range(first, last_start, 3)]

# Translate a dna sequence in a single frame
def translate_dna_single(dna, frame=1):
	"""
	Translate `dna` in a single reading frame.

	The sequence is cut with split_into_codons(dna, frame) and every codon is
	passed through translate_codon(); the one-letter results are concatenated
	and returned as a string (empty when there is no complete codon).
	"""
	return ''.join(translate_codon(triplet) for triplet in split_into_codons(dna, frame))


## MAIN FUNCTIONS 
## --------------

def Search_CODON(vcf_input, TMPDIR, GATK, SNPEFF, SNPEFF_HG, JAVA_OPTIONS, FIELDS, GATK4_alt):
	"""
	Call SnpClusters with GATK, annotate them with snpEff and build temp_file1.

	Steps (each one is a shell command; a non-zero exit raises
	subprocess.CalledProcessError):
	  0. index the input VCF (IndexFeatureFile),
	  1. flag clusters of 2 SNPs within a 3-base window (VariantFiltration),
	  2. keep only the records flagged SnpCluster (SelectVariants),
	  3. annotate them with snpEff,
	  4. keep SNPs only (SelectVariants),
	  5. export a tab-separated table (VariantsToTable).
	The table is then filtered to SILENT / MISSENSE / NONSENSE effects and
	written to TMPDIR + "temp_file1" with recoded genotypes and codon columns.

	vcf_input: path of the input VCF.
	TMPDIR: temporary directory; it is concatenated directly with file names,
		so it must end with a path separator.
	GATK, SNPEFF: commands (or .jar path for snpEff) to run.
	SNPEFF_HG: snpEff genome database name.
	JAVA_OPTIONS: java arguments given to GATK and snpEff ('' for none).
	FIELDS: comma-separated extra field names exported with -F.
	GATK4_alt: if true, IndexFeatureFile is called with -F instead of -I.

	Also reads the module-level VERBOSE, HG_REF and keep_anim. The run is
	ended through end_macaron() when no cluster, or no coding cluster, is found.

	USAGE INSTRUCTIONS:	Full path to the software directories should be set before running.
	"""

	field_names = FIELDS.split(",")
	Fld_Len = len(field_names)
	FldwithF = " ".join(["-F " + str(x) for x in field_names])

	GATK_verbose, SNPeff_verbose = ('', ' ') if VERBOSE else ('--QUIET --verbosity ERROR', ' ')
	GATK_java = '--java-options "{}"'.format(JAVA_OPTIONS) if JAVA_OPTIONS != '' else ''
	GATK = '{} {}'.format(GATK, GATK_java)
	if not(GATK4_alt):
		cmd0 = "{} IndexFeatureFile -I {} {}".format(GATK, vcf_input, GATK_verbose)
	else: ## this command changed after gatk 4.1.4.1
		cmd0 = "{} IndexFeatureFile -F {} {}".format(GATK, vcf_input, GATK_verbose)
	cmd1 = "{} VariantFiltration -R {} -V {} -O {}{} --cluster-size 2 --cluster-window-size 3 {}".format(GATK, HG_REF, vcf_input, TMPDIR, "snp_clsters2_ws3.vcf", GATK_verbose)
	cmd2 = "{} SelectVariants -R {} -V {}{} -O {}{} -select 'FILTER == SnpCluster' {}".format(GATK, HG_REF, TMPDIR, "snp_clsters2_ws3.vcf", TMPDIR, "snp_clsters2_ws3_clstronly.vcf", GATK_verbose)
	if SNPEFF.endswith('.jar'):
		cmd3 = "java {} -jar {} {} {} -formatEff -lof -classic {}{} > {}{}".format(JAVA_OPTIONS, SNPEFF, SNPeff_verbose, SNPEFF_HG, TMPDIR, "snp_clsters2_ws3_clstronly.vcf", TMPDIR, "snp_clsters2_ws3_clstronly_annt.vcf")
	else:
		cmd3 = "{} {} {} {} -formatEff -lof -classic {}{} > {}{}".format(SNPEFF, JAVA_OPTIONS, SNPeff_verbose, SNPEFF_HG, TMPDIR, "snp_clsters2_ws3_clstronly.vcf", TMPDIR, "snp_clsters2_ws3_clstronly_annt.vcf")

	cmd4 = "{} SelectVariants -R {} -V {}{} -O {}{} --select-type-to-include SNP {}".format(GATK, HG_REF, TMPDIR, "snp_clsters2_ws3_clstronly_annt.vcf", TMPDIR, "snp_clsters2_ws3_clstronly_annt_snv.vcf", GATK_verbose)
	cmd5 = "{} VariantsToTable -R {} -V {}{} -F CHROM -F POS -F ID -F REF -F ALT -F EFF {}  -GF GT --error-if-missing-data --show-filtered -O {}{} {}".format(GATK, HG_REF, TMPDIR, "snp_clsters2_ws3_clstronly_annt_snv.vcf", FldwithF, TMPDIR, "snp_clsters2_ws3_clstronly_annt_snv_clstronly.table", GATK_verbose)

	def _run_step(step_idx, command):
		# Run one shell command between the start/end progress messages.
		thread = print_step(keep_anim, step_idx)
		subprocess.check_output(command, shell=True)
		end_print_step(keep_anim, thread, step_idx)

	_run_step(0, cmd0)
	_run_step(1, cmd1)
	_run_step(2, cmd2)
	## If vcf empty : no cluster found, MACARON end
	check_vcf_empty(TMPDIR + 'snp_clsters2_ws3_clstronly.vcf')

	_run_step(3, cmd3)
	_run_step(4, cmd4)
	_run_step(5, cmd5)

	# Remove the report files from the working directory. Deleting only what
	# exists keeps a finished run from failing when a report was not written.
	for leftover in ("snpEff_genes.txt", "snpEff_summary.html"):
		if os.path.exists(leftover):
			os.remove(leftover)

	def Change_zygo(ref, alt, zyg):
		"""
		Convert genotypes written with bases (ref/alt) to 0/1 codes.

		ref = "A";alt = "G"
		zyg =  ['A/G', 'G/A', 'G/G', 'A/A', './.', 'G/.', './G', './A', 'A/.']
		result = ['0/1', '1/0', '1/1', '0/0', './.', '1/.', './1', './0', '0/.']

		Any allele other than `ref` or '.' is coded 1 (`alt` is not consulted).
		A fully missing genotype is returned unchanged, separator included.
		Only the first two alleles are read; a genotype without '/' or '|'
		raises IndexError.
		"""
		def allele_code(allele):
			if allele == ".":
				return "."
			return "0" if allele == ref else "1"

		recoded = []
		for cs in zyg:
			csp = re.split('[|/]', cs)
			if csp[0] == "." and csp[1] == ".":
				recoded.append(cs)
			else:
				recoded.append(allele_code(csp[0]) + "/" + allele_code(csp[1]))
		return recoded

	# Columns 0-5 are CHROM POS ID REF ALT EFF, then the FIELDS columns,
	# then one genotype column per sample.
	first_zyg_col = 6 + Fld_Len
	kept_effects = ("SILENT", "MISSENSE", "NONSENSE")
	with open(TMPDIR + "snp_clsters2_ws3_clstronly_annt_snv_clstronly.table", 'r') as f1, open(TMPDIR + "temp_file1", 'w') as output:
		first_line = f1.readline().strip()
		zyg_head = '\t'.join(first_line.split()[first_zyg_col:])
		output.write(first_line + "\t" + zyg_head + "\t" + str("Protein_coding_EFF\tAA-Change\tREF-codon\tALT-codon") + "\n")
		for line in f1:
			line1 = line.strip()
			line_TAB = line1.split("\t")
			line_EFF = line_TAB[5].split("|")
			if len(line_EFF) > 1 and line_EFF[1] in kept_effects:
				# line_EFF[2] holds "REFcodon/ALTcodon", line_EFF[3] the amino acid change.
				linesp = line_EFF[2].split("/")
				ref = line_TAB[3]
				alt = line_TAB[4]
				zyg = line_TAB[first_zyg_col:]
				chgz_out = '\t'.join(Change_zygo(ref, alt, zyg))
				wrt = str(line_EFF[1] + "\t" + line_EFF[3] + "\t" + linesp[0] + "\t" + linesp[1])
				output.write(line1 + "\t" + chgz_out + "\t" + wrt + "\n")
	with open(TMPDIR + "temp_file1", 'r') as handle:
		# Only the header line is left: no coding SNP cluster was found.
		if len(handle.readlines()) == 1:
			end_macaron("\nMACARON did not find any cluster of coding SNP in your VCF.\n")
			
	
def TWO_VAR(TMPDIR):
	"""
	###---Two variants (2VAR) codon changes---
	##----------------------------------

	Reads TMPDIR + "temp_file1" and writes TMPDIR + "temp_file2": the same
	rows with the column-5 text replaced by the gene name, plus two columns,
	"ALT-codon_merge-2VAR" (merged codon) and "AA-Change-2VAR" (its
	translation by translate_dna_single()).

	A row is compared with its neighbours through the first number found in
	the third-from-last column (the AA-Change column). When it matches the
	next row, the two ALT codons (last column) are merged position by
	position, the upper-case base of either codon winning over a lower-case
	one. Row i-1 for the first row is the last row of the file (negative
	index), as in the original implementation.

	NOTE(review): the column-5 text is replaced inside every field that
	contains it, not only in column 5 -- confirm this is intended.
	"""
	with open(TMPDIR + "temp_file1", "r") as handle:
		lines = handle.read().splitlines()

	# Both files are closed deterministically, even when an error is raised.
	with open(TMPDIR + "temp_file2", 'w') as writ2:
		midline_head = lines[0].strip().split("\t")
		midline_headcrp = '\t'.join([w.replace(midline_head[5], 'Gene_Name') for w in midline_head])
		writ2.write(str(midline_headcrp + "\t" + "ALT-codon_merge-2VAR" + "\t" + "AA-Change-2VAR") + "\n")

		# These three values deliberately persist from one row to the next.
		TRcode = ""; protcode = ""; midline_crpit = ""
		for i in range(len(lines)):
			midline_crp = lines[i].strip().split("\t")
			if len(midline_crp[5].split("|")) != 1:
				GeneName = midline_crp[5].split("|")[5]
				midline_crpit = '\t'.join([w.replace(midline_crp[5], GeneName) for w in midline_crp])
			try:
				line0 = lines[i-1].strip().split("\t")[-3]
				beforeline1 = re.findall(r"\d+", line0)

				line1 = lines[i].strip().split("\t")[-3]
				midline1 = re.findall(r"\d+", line1)

				line2 = lines[i+1].strip().split("\t")[-3]
				nextline1 = re.findall(r"\d+", line2)
				#----Condition to replace empty list ([] to ['000']) produced from the header column "AA-Change".
				if (line0 == "AA-Change"):
					beforeline1.append('000')
				elif (line1 == "AA-Change"):
					midline1.append('000')
				elif (line2 == "AA-Change"):
					nextline1.append('000')

				REFbf = []; line22 = ""
				if ((beforeline1[0] == midline1[0]) or (midline1[0] == nextline1[0])):
					REF = lines[i].strip().split("\t")[-2]
					if (midline1[0] == beforeline1[0]):
						REFbf = lines[i-1].strip().split("\t")[-2]

					line11 = lines[i].strip().split("\t")[-1]
					if (midline1[0] == nextline1[0]):
						line22 = lines[i+1].strip().split("\t")[-1]
					lscod1 = list(line11)
					lscod2 = list(line22)

					# Indexing (not zip) on purpose: a missing or short codon
					# must raise IndexError and reach the handler below.
					if (any(lscod1[k].islower() and lscod2[k].islower() for k in range(3))
							and any(lscod1[k] == lscod2[k] for k in range(3))):
						threeltr_code = []
						for k in range(3):
							base1 = lscod1[k]
							base2 = lscod2[k]
							if (base1.isupper() and base2.islower()) or (base1 == base2):
								threeltr_code.append(base1)
							elif base1.islower() and base2.isupper():
								threeltr_code.append(base2)

						if (len(threeltr_code) == 3):
							TRcode = ''.join(threeltr_code)
						else:
							TRcode = 'Multiallelic-t1'

					else:
						TRcode = '...'

					if (TRcode != line22):
						protcode = translate_dna_single(TRcode)
					else:
						TRcode = "."

					if (REF == REFbf):
						protcode = "#####"; TRcode = "Multiallelic-t2"
					writ2.write(str(midline_crpit + "\t" + TRcode + "\t" + protcode) + "\n")

			except IndexError:
				if not midline_crpit.split("\t")[0] == "CHROM":
					writ2.write(str(midline_crpit + "\t" + "." + "\t" + protcode) + "\n")

def THREE_VAR(TMPDIR):
	"""
	###---Three variants (3VAR) codon changes---
	##----------------------------------

	Reads TMPDIR + "temp_file2" and writes TMPDIR + "temp_file3": the same
	rows plus two columns, "ALT-codon_merge-3VAR" and "AA-Change-3VAR".

	Rows are compared with their neighbours through the first number found
	in the fifth-from-last column (the AA-Change column). When a row matches
	the next one and both 2VAR merged codons (second-from-last column) are
	three characters long, they are merged position by position and the
	result is translated with translate_dna_single(). Rows for which no
	merge applies get "." in both new columns. Row i-1 for the first row is
	the last row of the file (negative index), as in the original
	implementation.
	"""
	with open(TMPDIR + "temp_file2", "r") as handle:
		lines = handle.read().splitlines()

	# Both files are closed deterministically, even when an error is raised.
	with open(TMPDIR + "temp_file3", 'w') as writ3:
		midline_head = lines[0].strip()
		writ3.write(str(midline_head + "\t" + "ALT-codon_merge-3VAR" + "\t" + "AA-Change-3VAR") + "\n")

		# These two values deliberately persist from one row to the next.
		TRcode = ""; protcode = ""
		for i in range(len(lines)):
			try:
				midline_crp = lines[i].strip()
				beforeline1 = re.findall(r"\d+", lines[i-1].strip().split("\t")[-5])
				midline1 = re.findall(r"\d+", midline_crp.split("\t")[-5])
				nextline1 = re.findall(r"\d+", lines[i+1].strip().split("\t")[-5])

				line22 = []
				if ((beforeline1[0] == midline1[0]) or (midline1[0] == nextline1[0])):
					line11 = lines[i].strip().split("\t")[-2]
					if (midline1[0] == nextline1[0]):
						line22 = lines[i+1].strip().split("\t")[-2]

					if (len(line11) == 3 and len(line22) == 3):
						threeltr_code = []
						for base1, base2 in zip(line11, line22):
							if base1.isupper() and (base2.islower() or base2.isupper()):
								threeltr_code.append(base1)
							elif base1.islower() and base2.isupper():
								threeltr_code.append(base2)

						if (len(threeltr_code) == 3):
							TRcode = ''.join(threeltr_code)
						else:
							TRcode = 'Multiallelic-t1'

					else:
						TRcode = '.'

					if (TRcode != line22):
						protcode = translate_dna_single(TRcode)
						if len(protcode) == 0:
							protcode = "."; TRcode = "."
					else:
						TRcode = "."

					if protcode == "" or TRcode == "":
						protcode = "."; TRcode = "."

					writ3.write(str(midline_crp + "\t" + TRcode + "\t" + protcode) + "\n")

			except IndexError:
				# Reached for the header row and the last row (no usable
				# neighbour): data rows are written with "." placeholders.
				if not midline_crp.split("\t")[0] == "CHROM":
					protcode = "."; TRcode = "."
					writ3.write(str(midline_crp + "\t" + TRcode + "\t" + protcode) + "\n")

def PARS_OUT_VAR(TMPDIR):
	"""
	###---Parse variants (2VAR _ 3VAR) based on change of protein codons ---
	##----------------------------------

	Builds TMPDIR + "temp_file4" from TMPDIR + "temp_file3" with awk / grep /
	sed pipelines.

	Field NF-6 is the AA-Change column and field NF-2 the AA-Change-2VAR
	column. Rows whose AA-Change text contains the 2VAR amino acid are
	collected first: their AA-Change value, minus its last character, goes to
	TMPDIR + "matches.list". temp_file4 then receives every row that neither
	contains its own 2VAR amino acid nor matches (whole word) an entry of
	matches.list. With an empty matches.list, temp_file3 is copied as is.
	"""
	matches_path = TMPDIR + 'matches.list'

	# Step 1: record the rows where the merged amino acid already appears in
	# the single-variant annotation.
	subprocess.check_output("awk 'index($(NF-6), $(NF-2)) {print}' " + TMPDIR + "temp_file3 | awk '{print $(NF-6)}' | sed s'/.$//' > " + TMPDIR + "matches.list", shell=True)

	# Step 2: nothing recorded, so every row is kept.
	if not os.stat(matches_path).st_size > 0:
		subprocess.check_output("cp {}{} {}{}".format(TMPDIR, 'temp_file3', TMPDIR, 'temp_file4'), shell=True)
		return

	# Step 3: drop the recorded rows and the rows sharing their AA reference.
	subprocess.check_output("awk '!index($(NF-6), $(NF-2)) { print }' " + TMPDIR + "temp_file3 | grep -w -v -f " + TMPDIR + "matches.list | awk -F'\t' '{ print }' > " + TMPDIR + "temp_file4", shell=True)

def ZYGO_PAIR(macaron_output, TMPDIR, FIELDS):
	"""
	###---Pair of zygosity check: remove pair of codons for which one SNP is reference homozygous ---
	##----------------------------------

	Reads TMPDIR + "temp_file4" and writes the final table to `macaron_output`.

	macaron_output: path of the output file (overwritten).
	TMPDIR: temporary directory, ending with a path separator.
	FIELDS: comma-separated field names; their count locates the recoded
		genotype columns.

	Consecutive rows are paired through the first number found in the
	seventh-from-last column (the AA-Change column). A pair is written when,
	for at least one sample, the two recoded genotypes form one of the
	combinations in `checkList` (neither SNP is 0/0 or missing).

	NOTE(review): rows are also written from the IndexError handler, which is
	reached for the header row and for the last row; this is kept exactly as
	in the original implementation -- confirm the intended output.
	"""

	Fld_Len = len(FIELDS.split(","))
	with open(TMPDIR + "temp_file4", "r") as handle:
		lines = handle.read().splitlines()

	checkList = ['0/1:1/0', '1/0:0/1', '1/0:1/0', '0/1:0/1', '1/0:1/1', '1/1:1/0', '0/1:1/1', '1/1:0/1', '1/1:1/1']
	Start_lns = 6 + Fld_Len
	End_lns = 8

	# The output file is closed deterministically, even when an error is raised.
	with open(macaron_output, 'w') as writ4:
		writ4.write(str(lines[0].strip()) + "\n")

		nextline_pos_lst = []
		# Defined up front so the handler below never sees unbound names when
		# the table holds only its header line.
		nextline = None
		nextline_pos = None

		for i in range(len(lines)):
			beforeline_pos = re.findall(r"\d+", lines[i-1].strip().split("\t")[-7])
			midline = lines[i].strip()
			midline_pos = re.findall(r"\d+", midline.split("\t")[-7])
			try:
				nextline = lines[i+1].strip()
				nextline_pos = re.findall(r"\d+", nextline.split("\t")[-7])
				if (midline_pos[0] == nextline_pos[0]):
					Start_zyg_lns = len(midline.split("\t")) - Start_lns
					midline_zyg = midline.split("\t")[-Start_zyg_lns:-End_lns]
					nextline_zyg = nextline.split("\t")[-Start_zyg_lns:-End_lns]
					Mergetwozyg = ','.join([str(a) + ":" + b for a, b in zip(midline_zyg, nextline_zyg)])
					Mergetwozygsp = Mergetwozyg.split(",")
					if set(Mergetwozygsp).intersection(checkList) != set([]):
						if ((midline_pos[0] == nextline_pos[0]) and (midline_pos[0] == beforeline_pos[0])):
							writ4.write(str(nextline) + "\n")
						else:
							writ4.write(str(midline + "\n" + nextline) + "\n")
			except IndexError:
				#Condition to remove the duplicated line at the end and prints only paired lines.
				if not nextline_pos:
					# No following row with a position has been read: nothing to write.
					continue
				nextline_pos_lst.append(nextline_pos[0])
				if len(set(nextline_pos_lst)) == 1:
					writ4.write(str(nextline) + "\n")

## MACARON MAIN
if __name__ == "__main__":

	## Parsing arguments
	parser = ArgumentParser(description="-Script to identify SnpClusters (SNPs within the same genetic codon)")
	parser.add_argument("-i", "--infile", dest="INPUTFile",default=False, required=True, help="Full path of the input VCF file.")
	parser.add_argument("-o", "--outfile", dest="OUTPUTFile",default="./MACARON_output.txt", required=False, help="Path of the output txt file (Default Output file: MACARON_output.txt)")
	parser.add_argument("-f", "--fields", dest="Fields", default="QUAL", required=False, help=" Single field name or comma-seperated ',' multiple field names can be given. Field name should be given according to the (INFO) field header of the input vcf file. Example: -f Func.refGene,ExonicFunc.refGene,Gene.refGene,1000g2015aug_all,ExAC_ALL,ExAC_EAS,clinvar_20161128,gnomAD_exome_ALL,gnomAD_genome_ALL,EFF,CSQ")
	parser.add_argument("--HG_REF", dest="HG_REF_path", default=HG_REF, required=False, help="Indicate the full path to the reference genome fasta file")
	parser.add_argument("--GATK", dest="GATK_path", default=GATK, required=False, help="You can use this option to directly indicate the full path to the GATK program (gatk wrapper or .jar)")
	# The help text names the real flag (--JAVA_OPTIONS) and the real default.
	parser.add_argument("--JAVA_OPTIONS", dest="JAVA_OPTIONS", default=JAVA_OPTIONS, required=False, help='You can use this option to specify java arguments required by GATK (default: --JAVA_OPTIONS "-Xmx4g")')
	parser.add_argument("--SNPEFF", dest="SNPEFF_path", default=SNPEFF, required=False, help="You can use this option to directly indicate the full path to the snpEff jar or wrapper")
	parser.add_argument("--SNPEFF_HG", dest="SNPEFF_HG_version", default=SNPEFF_HG, required=False, help="Indicate SnpEff human genome annotation database version")
	parser.add_argument("-v", "--verbosity", dest="Verbosity", default=False, required=False, action='store_true', help="Use to print verbosity (Mostly GATK/SNPEFF output)")
	parser.add_argument("-c", "--add_anim", dest="ANIM", default=False, required=False, action='store_true', help="Add animation while running (looks good but costs a thread)")
	parser.add_argument("--keep_tmp", dest="KEEP_TMP", default=False, required=False, action='store_true', help="Keep temporary files in the directory tmp_macaron, at the same location than the output file.")
	parser.add_argument("--gatk4_previous", dest="GATK4_alt", default=False, required=False, action='store_true', help="Use this option if you are using a version of gatk 4 older than gatk 4.1.4.1")


	## Assign arguments to global variables
	args = parser.parse_args()
	FIELDS = args.Fields
	VERBOSE = args.Verbosity
	HG_REF = args.HG_REF_path
	GATK = args.GATK_path
	JAVA_OPTIONS = args.JAVA_OPTIONS
	SNPEFF = args.SNPEFF_path
	SNPEFF_HG = args.SNPEFF_HG_version
	ANIM = args.ANIM
	KEEP_TMP = args.KEEP_TMP
	GATK4_alt = args.GATK4_alt

	## Inputs / Outputs path & names
	INF = args.INPUTFile
	OUTF = args.OUTPUTFile
	TMPDIR = os.path.dirname(os.path.abspath(OUTF)) + "/macaron_tmp/"
	# Created without a shell so an output directory containing spaces works.
	if not os.path.isdir(TMPDIR):
		os.makedirs(TMPDIR)

	########################
	##    MAIN PROCESS    ##
	########################
	print(header)

	## Check if global variables are well defined and point to existing files:
	INF_check = os.path.exists(INF)
	HG_REF_check = os.path.exists(HG_REF)
	SNPEFF_HG_non_empty = (SNPEFF_HG != "")
	if not (INF_check and HG_REF_check and SNPEFF_HG_non_empty):
		print(">ERROR : One of the following variable is not defined: \n  VCF (input file .vcf) = {} > {}\n  HG_REF (reference genome .fa) = {} > {}\n  SNPEFF_HG (snpEff reference (ex:'GRCh37.75')) = {} > {}\n>Please correct and try again.\n".format(INF, INF_check, HG_REF, HG_REF_check, SNPEFF_HG, SNPEFF_HG_non_empty))
		sys.exit(1)

	## Check if gatk and snpEff exist (fall back to the commands found on PATH):
	if which(GATK) is None:
		if which('gatk') is not None:
			GATK = "gatk"
		else:
			print(">ERROR : Could not find GATK : {}\n>Please correct and try again!".format(GATK))
			sys.exit(1)
	if which(SNPEFF) is None:
		if which('snpEff') is not None:
			SNPEFF = "snpEff"
		else:
			print(">ERROR : Could not find SNPEFF : {}\n>Please correct and try again!".format(SNPEFF))
			sys.exit(1)

	## Animation event initialization
	keep_anim = multiprocessing.Event()

	## 1)VARIANTS FILTERING, ANNOTATION (GATK,SNPEff)
	Search_CODON(INF, TMPDIR, GATK, SNPEFF, SNPEFF_HG, JAVA_OPTIONS, FIELDS, GATK4_alt)
	## 2)SEARCH MULTI-SNPS CODONS
	thread = print_step(keep_anim, 6)
	TWO_VAR(TMPDIR)
	THREE_VAR(TMPDIR)
	end_print_step(keep_anim, thread, 6)
	## 3)CHECK IF SNPCLUSTERS IMPACT CODON
	thread = print_step(keep_anim, 7)
	PARS_OUT_VAR(TMPDIR)
	end_print_step(keep_anim, thread, 7)
	## 4) EXTRACT SNPCLUSTERS (Keeping SnpCluster if >=1 sample is Ref-Heterozygous or nonRef-Homozygous)
	thread = print_step(keep_anim, 8)
	ZYGO_PAIR(OUTF, TMPDIR, FIELDS)
	end_print_step(keep_anim, thread, 8)
	print(footer)
	if not(KEEP_TMP):
		# Argument list instead of a shell string: safe for paths with spaces.
		subprocess.check_output(["rm", "-r", TMPDIR])