-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrelatednames_table.py
executable file
·48 lines (39 loc) · 1.62 KB
/
relatednames_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python3
###
import sys,os,io,csv,click,logging
import pandas as pd
# Root logger setup: records are rendered as "LEVEL:message" and DEBUG is the
# active threshold (use logging.INFO instead for quieter runs).
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')
def _split_related_names(df_in):
    """Return one (LOINC_NUM, RELATEDNAME) row per distinct related name.

    The semicolon-delimited RELATEDNAMES2 field of each input row is split
    into individual names. Rows whose RELATEDNAMES2 is missing contribute
    nothing, names are stripped of surrounding whitespace, and empty names
    (e.g. from a trailing semicolon) are discarded.

    Args:
        df_in: DataFrame containing LOINC_NUM and RELATEDNAMES2 columns.

    Returns:
        DataFrame with columns LOINC_NUM and RELATEDNAME, duplicates removed.

    Raises:
        KeyError: if either required column is absent from df_in.
    """
    names = (
        df_in.set_index("LOINC_NUM")["RELATEDNAMES2"]
        .dropna()
        # astype(str) keeps the .str accessor usable even when the column was
        # parsed as a non-string dtype.
        .astype(str)
        .str.split(r"\s*;\s*")
        # explode() yields one row per list element, repeating the index;
        # this avoids building a wide temporary frame via apply(pd.Series).
        .explode()
        .str.strip()
    )
    names = names[names != ""]
    table = names.rename("RELATEDNAME").reset_index()
    # De-duplicate on the (LOINC_NUM, RELATEDNAME) pair. drop_duplicates()
    # ignores the index, so LOINC_NUM must be a regular column here; otherwise
    # a name shared by several LOINC codes would be kept for the first only.
    return table.drop_duplicates().reset_index(drop=True)


# NOTE: the command docstring below is shown verbatim by `--help`, so it is
# kept as the original one-line summary. Output is a tab-separated table with
# LOINC_NUM and RELATEDNAME columns written to stdout; logging goes to stderr.
@click.command()
@click.option("-i", "--input_file", required=True, type=click.Path(file_okay=True, dir_okay=False, exists=True), help="Input CSV|TSV file.")
@click.option("--tsv", is_flag=True, default=False, help="Input TSV flag (default is CSV)")
def main(input_file, tsv):
    """Split Loinc.csv relatednames2 column and create new table."""
    # No manual input_file check is needed: click rejects a missing or
    # non-existent path (required=True, exists=True) before this body runs.
    df_in = pd.read_csv(input_file, sep=("\t" if tsv else ","), low_memory=False)
    logging.info("Input rows: %d", df_in.shape[0])

    df_out = _split_related_names(df_in)
    logging.debug(df_out.head())

    # LOINC_NUM is a regular column now, so the index is not written; the
    # emitted header is still LOINC_NUM<TAB>RELATEDNAME.
    df_out.to_csv(sys.stdout, sep="\t", index=False)
    logging.info("Output rows: %d", df_out.shape[0])
# Run the click command only when executed as a script, not when imported.
if __name__ == "__main__":
    main()