Skip to content

Commit

Permalink
ConceptStats: punt on running it, for now
Browse files Browse the repository at this point in the history
  • Loading branch information
dckc committed Sep 18, 2019
1 parent fb9531a commit 22e9fdf
Showing 1 changed file with 6 additions and 15 deletions.
21 changes: 6 additions & 15 deletions tumor_reg_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,20 +958,9 @@ def make_tumor_schema(cls,
if IO_TESTING:
_spark.table('naaccr_observations').toPandas().to_csv(_cwd / 'naaccr_observations.csv', index=False)


# %% [markdown]
# ## Concept stats
#
# Does Spark SQL have a good hash function? It looks like it; the query below
# checks for `hash(c_fullname)` collisions in `naaccr_ontology`.

# %%
# Hash-collision check: hash every c_fullname in naaccr_ontology, group the
# rows by hash value, and keep only the hash values that occur more than once.
# Note that rows repeating the same c_fullname also land in the same group.
_SQL("""
select k, count(*) from (
select hash(c_fullname) as k from naaccr_ontology
)
group by k
having count(*) > 1
""")


# %%
class ConceptStats:
Expand All @@ -988,9 +977,11 @@ def make(cls, spark: SparkSession_T,
return list(views.values())[-1]


if IO_TESTING:
ConceptStats.make(_spark, _spark.table('naaccr_ontology'), _spark.table('naaccr_observations'))
_SQL('select * from concept_stats order by c_fullname', limit=15)
# if IO_TESTING:
# ConceptStats.make(_spark,
# _spark.table('naaccr_ontology').sample(False, 0.01),
# _spark.table('naaccr_observations1'))
# _SQL('select * from concept_stats order by c_fullname', limit=15)

# %% [markdown]
# ## Oracle DB Access
Expand Down

0 comments on commit 22e9fdf

Please sign in to comment.