Skip to content

Commit

Permalink
Merge pull request #61 from LiuzLab/fe1
Browse files Browse the repository at this point in the history
Optimize HGMD information search in Feature Engineering Part 1
  • Loading branch information
hyunhwan-bcm authored Aug 19, 2024
2 parents 73c5aed + 49dd7ce commit d04d442
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 119 deletions.
66 changes: 3 additions & 63 deletions bin/annotation/utils_for_marrvel_flatfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,94 +199,34 @@ def getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf, clinvarGeneDf):
return retList


def getHGMDUsingFlatFile(varObj, hgmdDf):
def getHGMDUsingFlatFile(varObj, hgmdHPOScoreGeneSortedDf):
"""
function to get HGMD from local flat file
Params:
varObj: a variant object read from VEP annotation
hgmdDf: HGMD data frame read from local file (CL: now it refers to hgmdHPOScoreDf in main.py)
hgmdHPOScoreGeneSortedDf: HGMD data frame read from local file, indexed by gene symbol (one row per gene)
Returns:
List of HGMD annotations
Update by CL:
"""
# print('\nin HGMD')
# print('\tvar:', varObj.varId_dash, 'var-gene:', varObj.geneSymbol)
HGMDDict = {}
hgmdGeneFound = 0
hgmdVarFound = 0
hgmdVarPhenIdList = []
hgmdVarHPOIdList = []
hgmdVarHPOStrList = []
chromVal = varObj.chrom
posVal = int(varObj.pos)
startVal = int(varObj.start)
stopVal = int(varObj.stop)
# print('\tpos type:',type(varObj.start),'chrom:', type(chromVal) )

"""
#using int columns
if 1:
vals=hgmdDf[ ( hgmdDf['chromosome'] == chromVal ) & ( hgmdDf['startCoord']==startVal ) & (hgmdDf['endCoord']==stopVal) ]
numRows=len(vals.index)
#using index
if 0:
idVal=str(chromVal)+'_'+str(startVal)+'_'+str(stopVal)
try:
vals=hgmdVarDf.loc[idVal]
numRows=len(vals.index)
print('index numRows:', numRows)
except:
numRows=0
print('\tvar numRows:', numRows)
if numRows>0:
hgmdVarFound=1
print('\tnumrows:',numRows)
print('\tvals:', vals)
if 'phen_id' in vals:
hgmdVarPhenIdList.extend(vals['phen_id'].tolist())
if 'hpo_id' in vals:
hgmdVarHPOIdList.extend(vals['hpo_id'].tolist())
if 'hpo_str' in vals:
hgmdVarHPOStrList.extend(vals['hpo_str'].tolist())
print('\tvals:', vals)
"""
# CL: check VarFound
if varObj.hgmd_id != "-":
hgmdVarFound = 1
else:
hgmdVarFound = 0

"""
#check gene
#vals=hgmdGeneDf[(hgmdGeneDf['gene']==varObj.geneSymbol)]
print('\t1 HGMD geneSymbol:', varObj.geneSymbol)
try:
print('\t2 HGMD geneSymbol:', varObj.geneSymbol)
vals=hgmdDf.loc[varObj.geneSymbol]
#vals=hgmdDf[ ( hgmdDf['gene'] == varObj.geneSymbol ) ]
numRows=len(vals.index)
except:
numRows=0
print('\tHGMD gene found numRows:', numRows)
if numRows>0:
hgmdGeneFound=1
"""
# CL: check geneFound
if np.any(hgmdDf["gene_sym"].isin([varObj.geneSymbol])):
if varObj.geneSymbol in hgmdHPOScoreGeneSortedDf.index:
hgmdGeneFound = 1
else:
hgmdGeneFound = 0

# print('\thgmdVarFound:',hgmdVarFound,'hgmdGeneFound:',hgmdGeneFound,
# 'hgmdVarPhenIdList:',hgmdVarPhenIdList,'hgmdVarHPOIdList:',hgmdVarHPOIdList,
# 'hgmdVarHPOStrList:',hgmdVarHPOStrList)
# return
retList = [
hgmdVarFound,
hgmdGeneFound,
Expand Down
8 changes: 4 additions & 4 deletions bin/annotation/utils_for_marrvel_flatfile_module_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,9 +394,9 @@ def getAnnotateInfoRow_3_5(

def getAnnotateInfoRow_3_6(
varObj,
hgmdHPOScoreDf,
hgmdHPOScoreGeneSortedDf,
):
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreGeneSortedDf)

return {
"hgmdVarFound": hgmdRet[0],
Expand All @@ -414,7 +414,7 @@ def getAnnotateInfoRows_3(
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
hgmdHPOScoreGeneSortedDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
Expand Down Expand Up @@ -446,7 +446,7 @@ def f6(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_6(
row, hgmdHPOScoreDf
row, hgmdHPOScoreGeneSortedDf
)

annotateInfoDf = vepDf.apply(f1, axis=1, result_type='expand')
Expand Down
40 changes: 6 additions & 34 deletions bin/annotation/utils_for_symMatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def omimSymMatch(varObj, omimHPOScoreDf, inFileType):
# print('\tomimSymMatchFlag:', varObj.omimSymMatchFlag)


def hgmdSymMatch(varObj, hgmdHPOScoreDf):
def hgmdSymMatch(varObj, hgmdHPOScoreAccSortedDf, hgmdHPOScoreGeneSortedDf):
"""
Find HGMD symptom match score
Param:
Expand All @@ -71,44 +71,16 @@ def hgmdSymMatch(varObj, hgmdHPOScoreDf):
# print('\nin HGMDSymMatch')
# print('\tvar:', varObj.varId_dash)
hgmdSymptomSimScore = "-"
""" old version
if varObj.hgmdVarFound:
var_tmp = varObj.varId_dash.split('-')
var_tmp = 'chr%s:%s %s>%s'%(var_tmp[0],var_tmp[1],var_tmp[2],var_tmp[3])
print('\tvar_tmp:', var_tmp)
if var_tmp in hgmdHPOScoreDf['hgvs'].tolist():
varDf = hgmdHPOScoreDf[hgmdHPOScoreDf['hgvs'] == var_tmp]
varScore = max(varDf['Similarity_Score'].tolist())
varObj.hgmdSymptomScore = varScore
if varScore >= 0.2:
varObj.hgmdSymMatchFlag = 1
hgmdSymptomSimScore=varScore
else:
pass
elif varObj.hgmdGeneFound:
if varObj.geneSymbol in hgmdHPOScoreDf['Gene'].tolist():
geneDf = hgmdHPOScoreDf[hgmdHPOScoreDf['Gene'] == varObj.geneSymbol]
geneScore = max(geneDf['Similarity_Score'].tolist())
if geneScore >= 0.2:
varObj.hgmdSymMatchFlag = 1
hgmdSymptomSimScore=geneScore
#store
varObj.hgmdSymptomSimScore=hgmdSymptomSimScore
print('hgmdSymMatch results:')
print('\thgmdSymMatchFlag:', varObj.hgmdSymMatchFlag)
print('\thgmdSymptomSimScore:', varObj.hgmdSymptomSimScore)
"""
if np.any(hgmdHPOScoreDf["acc_num"].isin([varObj.hgmd_id])):
varDf = hgmdHPOScoreDf[hgmdHPOScoreDf["acc_num"] == varObj.hgmd_id]
varScore = max(varDf["Similarity_Score"].tolist())
if varObj.hgmd_id in hgmdHPOScoreAccSortedDf.index:
varScore = hgmdHPOScoreAccSortedDf.loc[varObj.hgmd_id].Similarity_Score

varObj.hgmdSymptomScore = varScore
if varScore >= 0.2:
varObj.hgmdSymMatchFlag = 1
hgmdSymptomSimScore = varScore

elif varObj.hgmdGeneFound:
geneDf = hgmdHPOScoreDf[hgmdHPOScoreDf["gene_sym"] == varObj.geneSymbol]
geneScore = max(geneDf["Similarity_Score"].tolist())
geneScore = hgmdHPOScoreGeneSortedDf.loc[varObj.geneSymbol].Similarity_Score

if geneScore >= 0.2:
varObj.hgmdSymMatchFlag = 1
hgmdSymptomSimScore = geneScore
Expand Down
24 changes: 6 additions & 18 deletions bin/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ def main():
# initialization
dgvDf = []
decipherDf = []
hgmdDf = []
omimHPOScoreDf = []
hgmdHPOScoreDf = []
clinvarGeneDf = []
Expand All @@ -148,7 +147,7 @@ def main():
if args.genomeRef == "hg38":
fileName = "annotate/anno_hg19/gene_clinvar.csv"
else:
fileName = "annotate/anno_hg19/gene_clinvar.csv"
fileName = "annotate/anno_hg19/gene_clinvar.csv"

clinvarGeneDf = pd.read_csv(fileName, sep=",")
# sort by gene name
Expand All @@ -162,19 +161,7 @@ def main():

with open(fileName) as f:
omimGeneList = json.load(f)

if debugFlag == 1:
for omimGeneDict in omimGeneList:
print("type of omimGeneDict:", type(omimGeneDict))
print("keys:", omimGeneDict.keys())
for keyVal in omimGeneDict.keys():
print("keyVal:", keyVal)
print("\tsubkeys type:", type(omimGeneDict[keyVal]))
if isinstance(omimGeneDict[keyVal], list):
print("\n\t\tfound list")
print("\t\t type:", type(omimGeneDict[keyVal]))

break
omimGeneDf = pd.DataFrame(omimGeneList)

# read the OMIM allele file
fileName = "annotate/anno_hg19/omim_alleric_variants.json"
Expand Down Expand Up @@ -298,8 +285,9 @@ def main():
gnomadMetricsGeneSortedDf = gnomadMetricsGeneDf.groupby('gene').first().sort_index()

if "curate" in moduleList:
omimGeneDf = pd.DataFrame(omimGeneList)
omimGeneSortedDf = omimGeneDf.set_index('geneSymbol').sort_index()
hgmdHPOScoreGeneSortedDf = hgmdHPOScoreDf.groupby('gene_sym').first().sort_index()
hgmdHPOScoreAccSortedDf = hgmdHPOScoreDf.groupby('acc_num').first().sort_index()

annotateInfoDf = getAnnotateInfoRows_3(
varDf,
Expand All @@ -308,7 +296,7 @@ def main():
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
hgmdHPOScoreGeneSortedDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
Expand Down Expand Up @@ -359,7 +347,7 @@ def f(varObj):
for i, varObj in annotateInfoDf.iterrows():
# the curate score is under the utils_1.py file
omimSymMatch(varObj, omimHPOScoreDf, args.inFileType)
hgmdSymMatch(varObj, hgmdHPOScoreDf)
hgmdSymMatch(varObj, hgmdHPOScoreAccSortedDf, hgmdHPOScoreGeneSortedDf)
clinVarSymMatch(varObj, args.inFileType)
# OMIM and clinvar info
retList = getCurationScore(
Expand Down

0 comments on commit d04d442

Please sign in to comment.