Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize HGMD information search in Feature Engineering Part 1 #61

Merged
merged 1 commit into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 3 additions & 63 deletions bin/annotation/utils_for_marrvel_flatfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,94 +199,34 @@ def getClinVarUsingMarrvelFlatFile(varObj, clinvarAlleleDf, clinvarGeneDf):
return retList


def getHGMDUsingFlatFile(varObj, hgmdDf):
def getHGMDUsingFlatFile(varObj, hgmdHPOScoreGeneSortedDf):
"""
function to get HGMD from local flat file
Params:
varObj: a variant object read from VEP annotation
hgmdDf: HGMD data frame read from local file (CL: now it refers to hgmdHPOScoreDf in main.py)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for cleaning up the comments.

hgmdHPOScoreGeneSortedDf: HGMD data frame read from local file

Returns:
List of HGMD annotations

Update by CL:
"""
# print('\nin HGMD')
# print('\tvar:', varObj.varId_dash, 'var-gene:', varObj.geneSymbol)
HGMDDict = {}
hgmdGeneFound = 0
hgmdVarFound = 0
hgmdVarPhenIdList = []
hgmdVarHPOIdList = []
hgmdVarHPOStrList = []
chromVal = varObj.chrom
posVal = int(varObj.pos)
startVal = int(varObj.start)
stopVal = int(varObj.stop)
# print('\tpos type:',type(varObj.start),'chrom:', type(chromVal) )

"""
#using int columns
if 1:
vals=hgmdDf[ ( hgmdDf['chromosome'] == chromVal ) & ( hgmdDf['startCoord']==startVal ) & (hgmdDf['endCoord']==stopVal) ]
numRows=len(vals.index)
#using index
if 0:
idVal=str(chromVal)+'_'+str(startVal)+'_'+str(stopVal)
try:
vals=hgmdVarDf.loc[idVal]
numRows=len(vals.index)
print('index numRows:', numRows)
except:
numRows=0

print('\tvar numRows:', numRows)


if numRows>0:
hgmdVarFound=1
print('\tnumrows:',numRows)
print('\tvals:', vals)
if 'phen_id' in vals:
hgmdVarPhenIdList.extend(vals['phen_id'].tolist())
if 'hpo_id' in vals:
hgmdVarHPOIdList.extend(vals['hpo_id'].tolist())
if 'hpo_str' in vals:
hgmdVarHPOStrList.extend(vals['hpo_str'].tolist())
print('\tvals:', vals)
"""
# CL: check VarFound
if varObj.hgmd_id != "-":
hgmdVarFound = 1
else:
hgmdVarFound = 0

"""
#check gene
#vals=hgmdGeneDf[(hgmdGeneDf['gene']==varObj.geneSymbol)]
print('\t1 HGMD geneSymbol:', varObj.geneSymbol)
try:
print('\t2 HGMD geneSymbol:', varObj.geneSymbol)
vals=hgmdDf.loc[varObj.geneSymbol]
#vals=hgmdDf[ ( hgmdDf['gene'] == varObj.geneSymbol ) ]
numRows=len(vals.index)
except:
numRows=0

print('\tHGMD gene found numRows:', numRows)
if numRows>0:
hgmdGeneFound=1
"""
# CL: check geneFound
if np.any(hgmdDf["gene_sym"].isin([varObj.geneSymbol])):
if varObj.geneSymbol in hgmdHPOScoreGeneSortedDf.index:
hgmdGeneFound = 1
else:
hgmdGeneFound = 0

# print('\thgmdVarFound:',hgmdVarFound,'hgmdGeneFound:',hgmdGeneFound,
# 'hgmdVarPhenIdList:',hgmdVarPhenIdList,'hgmdVarHPOIdList:',hgmdVarHPOIdList,
# 'hgmdVarHPOStrList:',hgmdVarHPOStrList)
# return
retList = [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure we need the other items here, but let's keep them for now and remove them in a future PR.

hgmdVarFound,
hgmdGeneFound,
Expand Down
8 changes: 4 additions & 4 deletions bin/annotation/utils_for_marrvel_flatfile_module_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,9 +394,9 @@ def getAnnotateInfoRow_3_5(

def getAnnotateInfoRow_3_6(
varObj,
hgmdHPOScoreDf,
hgmdHPOScoreGeneSortedDf,
):
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreDf)
hgmdRet = getHGMDUsingFlatFile(varObj, hgmdHPOScoreGeneSortedDf)

return {
"hgmdVarFound": hgmdRet[0],
Expand All @@ -414,7 +414,7 @@ def getAnnotateInfoRows_3(
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
hgmdHPOScoreGeneSortedDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
Expand Down Expand Up @@ -446,7 +446,7 @@ def f6(row):
if "curate" not in moduleList:
return row
return getAnnotateInfoRow_3_6(
row, hgmdHPOScoreDf
row, hgmdHPOScoreGeneSortedDf
)

annotateInfoDf = vepDf.apply(f1, axis=1, result_type='expand')
Expand Down
40 changes: 6 additions & 34 deletions bin/annotation/utils_for_symMatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def omimSymMatch(varObj, omimHPOScoreDf, inFileType):
# print('\tomimSymMatchFlag:', varObj.omimSymMatchFlag)


def hgmdSymMatch(varObj, hgmdHPOScoreDf):
def hgmdSymMatch(varObj, hgmdHPOScoreAccSortedDf, hgmdHPOScoreGeneSortedDf):
"""
Find HGMD symptom match score
Param:
Expand All @@ -71,44 +71,16 @@ def hgmdSymMatch(varObj, hgmdHPOScoreDf):
# print('\nin HGMDSymMatch')
# print('\tvar:', varObj.varId_dash)
hgmdSymptomSimScore = "-"
""" old version
if varObj.hgmdVarFound:
var_tmp = varObj.varId_dash.split('-')
var_tmp = 'chr%s:%s %s>%s'%(var_tmp[0],var_tmp[1],var_tmp[2],var_tmp[3])
print('\tvar_tmp:', var_tmp)
if var_tmp in hgmdHPOScoreDf['hgvs'].tolist():
varDf = hgmdHPOScoreDf[hgmdHPOScoreDf['hgvs'] == var_tmp]
varScore = max(varDf['Similarity_Score'].tolist())
varObj.hgmdSymptomScore = varScore
if varScore >= 0.2:
varObj.hgmdSymMatchFlag = 1
hgmdSymptomSimScore=varScore
else:
pass
elif varObj.hgmdGeneFound:
if varObj.geneSymbol in hgmdHPOScoreDf['Gene'].tolist():
geneDf = hgmdHPOScoreDf[hgmdHPOScoreDf['Gene'] == varObj.geneSymbol]
geneScore = max(geneDf['Similarity_Score'].tolist())
if geneScore >= 0.2:
varObj.hgmdSymMatchFlag = 1
hgmdSymptomSimScore=geneScore
#store
varObj.hgmdSymptomSimScore=hgmdSymptomSimScore
print('hgmdSymMatch results:')
print('\thgmdSymMatchFlag:', varObj.hgmdSymMatchFlag)
print('\thgmdSymptomSimScore:', varObj.hgmdSymptomSimScore)
"""
if np.any(hgmdHPOScoreDf["acc_num"].isin([varObj.hgmd_id])):
varDf = hgmdHPOScoreDf[hgmdHPOScoreDf["acc_num"] == varObj.hgmd_id]
varScore = max(varDf["Similarity_Score"].tolist())
if varObj.hgmd_id in hgmdHPOScoreAccSortedDf.index:
varScore = hgmdHPOScoreAccSortedDf.loc[varObj.hgmd_id].Similarity_Score

varObj.hgmdSymptomScore = varScore
if varScore >= 0.2:
varObj.hgmdSymMatchFlag = 1
hgmdSymptomSimScore = varScore

elif varObj.hgmdGeneFound:
geneDf = hgmdHPOScoreDf[hgmdHPOScoreDf["gene_sym"] == varObj.geneSymbol]
geneScore = max(geneDf["Similarity_Score"].tolist())
geneScore = hgmdHPOScoreGeneSortedDf.loc[varObj.geneSymbol].Similarity_Score

if geneScore >= 0.2:
varObj.hgmdSymMatchFlag = 1
hgmdSymptomSimScore = geneScore
Expand Down
24 changes: 6 additions & 18 deletions bin/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ def main():
# initialization
dgvDf = []
decipherDf = []
hgmdDf = []
omimHPOScoreDf = []
hgmdHPOScoreDf = []
clinvarGeneDf = []
Expand All @@ -148,7 +147,7 @@ def main():
if args.genomeRef == "hg38":
fileName = "annotate/anno_hg19/gene_clinvar.csv"
else:
fileName = "annotate/anno_hg19/gene_clinvar.csv"
fileName = "annotate/anno_hg19/gene_clinvar.csv"

clinvarGeneDf = pd.read_csv(fileName, sep=",")
# sort by gene name
Expand All @@ -162,19 +161,7 @@ def main():

with open(fileName) as f:
omimGeneList = json.load(f)

if debugFlag == 1:
for omimGeneDict in omimGeneList:
print("type of omimGeneDict:", type(omimGeneDict))
print("keys:", omimGeneDict.keys())
for keyVal in omimGeneDict.keys():
print("keyVal:", keyVal)
print("\tsubkeys type:", type(omimGeneDict[keyVal]))
if isinstance(omimGeneDict[keyVal], list):
print("\n\t\tfound list")
print("\t\t type:", type(omimGeneDict[keyVal]))

break
omimGeneDf = pd.DataFrame(omimGeneList)

# read the OMIM allele file
fileName = "annotate/anno_hg19/omim_alleric_variants.json"
Expand Down Expand Up @@ -298,8 +285,9 @@ def main():
gnomadMetricsGeneSortedDf = gnomadMetricsGeneDf.groupby('gene').first().sort_index()

if "curate" in moduleList:
omimGeneDf = pd.DataFrame(omimGeneList)
omimGeneSortedDf = omimGeneDf.set_index('geneSymbol').sort_index()
hgmdHPOScoreGeneSortedDf = hgmdHPOScoreDf.groupby('gene_sym').first().sort_index()
hgmdHPOScoreAccSortedDf = hgmdHPOScoreDf.groupby('acc_num').first().sort_index()

annotateInfoDf = getAnnotateInfoRows_3(
varDf,
Expand All @@ -308,7 +296,7 @@ def main():
clinvarAlleleDf,
omimGeneSortedDf,
omimAlleleList,
hgmdHPOScoreDf,
hgmdHPOScoreGeneSortedDf,
moduleList,
decipherSortedDf,
gnomadMetricsGeneSortedDf,
Expand Down Expand Up @@ -359,7 +347,7 @@ def f(varObj):
for i, varObj in annotateInfoDf.iterrows():
# the curate score is under the utils_1.py file
omimSymMatch(varObj, omimHPOScoreDf, args.inFileType)
hgmdSymMatch(varObj, hgmdHPOScoreDf)
hgmdSymMatch(varObj, hgmdHPOScoreAccSortedDf, hgmdHPOScoreGeneSortedDf)
clinVarSymMatch(varObj, args.inFileType)
# OMIM and clinvar info
retList = getCurationScore(
Expand Down
Loading