Skip to content

Commit

Permalink
Minor release. Reformatted doc strings, fixed broad try-except clause
Browse files Browse the repository at this point in the history
All the "save_figure" options were changed to "False". One of the functions takes a list, and the default was wrongly set to a tuple. Minor release.
  • Loading branch information
lperezmo committed Aug 22, 2020
1 parent 51d0102 commit 62034be
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 73 deletions.
5 changes: 5 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,8 @@ History
------------------

* Second release on PyPI.

0.2.4 (2020-08-22)
------------------

* Minor release on PyPI.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
# The short X.Y version.
version = '0.2'
# The full version, including alpha/beta/rc tags.
release = '0.2.3'
release = '0.2.4'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion flowsym/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

__author__ = """Luis Perez Morales, Michael M. Shavlik"""
__email__ = 'lperezmo@uoregon.edu'
__version__ = '0.2.3'
__version__ = '0.2.4'

# Import the main module in this package
from flowsym.flowsym import *
Expand Down
165 changes: 95 additions & 70 deletions flowsym/flowsym.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,12 @@
from scipy.stats import ks_2samp
from sklearn.mixture import GaussianMixture

# To make Travis happy. Attempt absolute path first and then from raw Github file
try:
spectrum_data = pd.read_csv('flowsym/data/FPbase_Spectra_updated.csv').fillna(value=0)
except:
spectrum_data = pd.read_csv('https://raw.githubusercontent.com/harmslab/flowsym/master/flowsym/data/FPbase_Spectra_updated.csv').fillna(value=0)
# Import from raw Github file
spectrum_data = pd.read_csv('https://raw.githubusercontent.com/harmslab/flowsym/master/flowsym/data'
'/FPbase_Spectra_updated.csv').fillna(value=0)


def create_controls(size, colors=('blue', 'cyan', 'green', 'yellow', 'orange', 'red', 'far_red', 'nir', 'ir')):
def create_controls(size, colors=['blue', 'cyan', 'green', 'yellow', 'orange', 'red', 'far_red', 'nir', 'ir']):
"""
This is a function that takes a DataFrame size (i.e. number of controls) and
a list of colors the user wants to run controls for.
Expand Down Expand Up @@ -134,9 +132,10 @@ def create_controls(size, colors=('blue', 'cyan', 'green', 'yellow', 'orange', '
def create_sample(size, colors=['blue', 'cyan', 'green', 'yellow', 'orange', 'red', 'far_red', 'nir', 'ir'],
weights=[]):
"""
This is a function that takes a defined dataframe length for number of samples (int)
and excitation and emission wavelengths (list,list). Assumes equal probability of each
color unless specified by the user.
This is a function that takes a defined dataframe length for
number of samples (int) and excitation and emission wavelengths
(list,list). Assumes equal probability of each color unless
specified by the user.
Parameters
----------
Expand Down Expand Up @@ -239,22 +238,25 @@ def create_sample(size, colors=['blue', 'cyan', 'green', 'yellow', 'orange', 're

# Bandwidth on lasers is +-5 nm. channels are [450+-25, 525+-25, 600+-30, 665+-15, 720+-30, 785+-30] for filter set 2
def measure(dataframe, lasers=[405, 488, 561, 638], channels=[1, 2, 3, 4, 5, 6],
create_fcs=True, outfile_name='data/sample_output.fcs'):
create_fcs=False, outfile_name='sample_output.fcs'):
"""
This is a function that will measure fluorescence intensity for any given sample
DataFrame and laser/channel parameters. Output will be an fcs file (default) that is
the same size as the sample you ran in the function. Alternatively, you can return
just a pandas DataFrame object by setting return_fcs=False. The user can set the output
file name manually to simulate creating multiple samples and measurements.
This is a function that will measure fluorescence
intensity for any given sample DataFrame and laser/channel
parameters. Output will be just a pandas DataFrame object
because create_fcs=False by default.
Alternatively, you can create an .fcs file by setting create_fcs=True.
The user can set the output file name manually to simulate
creating multiple samples and measurements.
Parameters
----------
dataframe : the Dataframe of sample data that will be used to generate the simulated
fluorescence intensity
dataframe : the Dataframe of sample data that will be used
to generate the simulated fluorescence intensity
lasers : laser channel parameters, default are [405, 488, 561, 638] nm
channels: return output for select channels, options are [1,2,3,4,5,6]
create_fcs : create a .fcs file from generated Pandas Dataframe using 'fcsy' module.
Default = True.
create_fcs : create a .fcs file from generated Pandas Dataframe
using 'fcsy' module. Default = False.
outfile_name : name of the .fcs file created
Returns
Expand Down Expand Up @@ -359,36 +361,49 @@ def measure(dataframe, lasers=[405, 488, 561, 638], channels=[1, 2, 3, 4, 5, 6],
return output


def cluster(measured_data, min_cluster_size=50, savefig=True):
def cluster(measured_data, min_cluster_size=50, savefig=False):
"""
This is a function to cluster flow cytometry data that has been measured in fluorescence channels using
density-based spatial clustering of applications with noise (DBSCAN), which clusters based on density of points
in an unsupervised method. The number of clusters does not need to be explicitly stated by the users. The only
parameter that needs to be optimized is min_cluster_size, which is set to 50 here. But I recommend 1% of the len(
data) Resulting plots are a bar chart showing the number of cells in each cluster and a heatmap of the median
fluorescence intensity in each channel for each cluster.
Note: clusters that are labeled '0' are cells that the DBSCAN could not cluster.
Returns a tuple of two dictionaries. The first dictionary is the median fluorescence represented in the heatmap
while the second dictionary holds all the fluorescence vectors for each cluster. Both of these are needed
for a dip test and re-clustering.
This is a function to cluster flow cytometry data that
has been measured in fluorescence channels using density-based
spatial clustering of applications with noise (DBSCAN), which
clusters based on density of points in an unsupervised method.
The number of clusters does not need to be explicitly stated by
the users. The only parameter that needs to be optimized is
min_cluster_size, which is set to 50 here, but 1% of len(data)
is recommended. Resulting plots are a bar chart showing the number
of cells in each cluster and a heatmap of the median fluorescence
intensity in each channel for each cluster.
Note: clusters that are labeled '0' are cells that the
DBSCAN could not cluster.
Returns a tuple of two dictionaries. The first dictionary is the
median fluorescence represented in the heatmap while the second
dictionary holds all the fluorescence vectors for each cluster.
Both of these are needed for a dip test and re-clustering.
Parameters
----------
measured_data : simulated or experimental flow cytometry data that has been measured in
fluorescence channels
min_cluster_size : default = 50, needs to be optimized by user. Typically needs to be
1% of len(data).
savefig: Save generated bar chart showing the number of cells in each cluster and a heat map
of the median fluorescence intensity in each channel for each cluster.
measured_data : simulated or experimental flow cytometry data
that has been measured in fluorescence channels.
min_cluster_size : default = 50, needs to be optimized by user.
Typically needs to be 1% of len(data).
savefig: Save generated bar chart showing the number of cells in
each cluster and a heat map of the median fluorescence
intensity in each channel for each cluster.
Figure is saved using 'matplotlib' module.
Returns
-------
output : a tuple of two dictionaries. The first dictionary is the median fluorescence represented
in the heatmap while the second dictionary holds all the fluorescence vectors for each
cluster. Both of these are needed for a dip test and re-clustering.
(final_dictionary, cluster_dict) : a tuple of two dictionaries.
The first dictionary is the
median fluorescence represented
in the heatmap while the second
dictionary holds all the
fluorescence vectors for each
cluster.
Both of these are needed for a
dip test and re-clustering.
See Also
--------
Expand Down Expand Up @@ -473,32 +488,41 @@ def cluster(measured_data, min_cluster_size=50, savefig=True):
return (final_dictionary, cluster_dict)


def dip_test(median_FL_data, total_data, alpha=0.05, save_figure=True):
def dip_test(median_FL_data, total_data, alpha=0.05, save_figure=False):
"""
Perform a Hartigan's dip test to check for unimodality in clusters and splits clusters if bimodality is found.
This function will take the highest intensity channel for each cluster and
check for bimodality to correct for errors in clustering similar fluorescence profiles.
Changing alpha will alter how stringent the dip test is. A higher alpha will result in higher detection
of bimodality, but runs a greater risk of false identification. It is important to note
that this dip test is relatively coarse grained and will not identify very slight populations
of mixed cells (e.g. 10 orange cells clustered with 1000 red cells)
Returns an updated clustering of the primary clustering after performing a dip test
Perform a Hartigan's dip test to check for unimodality
in clusters and splits clusters if bimodality is found.
This function will take the highest intensity channel
for each cluster and check for bimodality to correct for
errors in clustering similar fluorescence profiles.
Changing alpha will alter how stringent the dip test is.
A higher alpha will result in higher detection of bimodality,
but runs a greater risk of false identification. It is
important to note that this dip test is relatively coarse
grained and will not identify very slight populations of mixed
cells (e.g. 10 orange cells clustered with 1000 red cells).
Returns an updated clustering of the primary clustering
after performing a dip test.
Parameters
----------
median_FL_data : dict, clustering data generated by 'flowsym.cluster' function
total_data : other fluorescence profiles for which errors will be corrected
alpha: how stringent the dip test is
save_figure : Save generated bar chart showing the number of cells in each cluster and a heat map
of the median fluorescence intensity in each channel for each cluster.
Figure is saved using 'matplotlib' module.
median_FL_data : dict, clustering data generated by
'flowsym.cluster' function
total_data : other fluorescence profiles for which errors
will be corrected
alpha : how stringent the dip test is
save_figure : Save generated bar chart showing the number of
cells in each cluster and a heat map of the median
fluorescence intensity in each channel for each
cluster. Figure is saved using 'matplotlib' module.
Returns
-------
output : a tuple of two dictionaries. The first dictionary is the median fluorescence represented
in the heatmap while the second dictionary holds all the fluorescence vectors for each
cluster. Both of these are needed for a dip test and re-clustering.
change_dict : a dictionary containing the correction that must be
applied to similar fluorescence profiles if
bimodality is found.
See Also
--------
Expand Down Expand Up @@ -627,7 +651,7 @@ def dip_test(median_FL_data, total_data, alpha=0.05, save_figure=True):
return change_dict


def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):
def gaus_recluster(median_FL_data, total_data, tolerance=.25, save_figure=False):
"""
Applies a gaussian mixture model with n_components=2
to try and separate rare populations of cells from
Expand All @@ -649,11 +673,11 @@ def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):
----------
median_FL_data : data with median FL for each cluster
total_data : data with all measured FL for each cluster
tolerance : how different do the sizes of clusters have to be before they
are considered actually distinct spectrally?
tolerance : how different do the sizes of clusters have
to be before they are considered actually distinct?
Increase this to be more stringent in splitting clusters.
Decrease the value to allow more re-clustering at the cost of
false positives.
Decrease the value to allow more re-clustering at
the cost of false positives.
save_figure : Save figure using 'matplotlib' module.
Returns
Expand Down Expand Up @@ -712,11 +736,12 @@ def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):
# Do a ks 2 test to see if clusters are different
result = ks_2samp(clust1[max_channel], clust2[max_channel])

# Test how different our cluster populations are. If the difference between the sizes is more than <tolerance>, of the
# total, then we'll say we actually found a bimodal population to split
# Test how different our cluster populations are. If the difference between the sizes is more than
# <tolerance>, of the total, then we'll say we actually found a bimodal population to split
clust_split = abs(len(clust1) - len(clust2)) / (len(clust1) + len(clust2))

# Keep the split clusters if they meet our splitting criteria, otherwise retain original clusters from DB scan
# Keep the split clusters if they meet our splitting criteria, otherwise retain original clusters from DB
# scan
if clust_split > tolerance:
if result[1] < 1e-10:
new_val = clust1.values.tolist()
Expand All @@ -736,7 +761,7 @@ def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):

plt.tight_layout()

if savefig:
if save_figure:
plt.savefig('gaus_mix_cluster_split')

final_reclustered = {}
Expand Down Expand Up @@ -788,7 +813,7 @@ def gaus_recluster(median_FL_data, total_data, tolerance=.25, savefig=True):
plt.yticks(rotation=0)
plt.tight_layout()

if savefig:
if save_figure:
plt.savefig('reclustered_after_gaus_mix_ks2')

return reclustered
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/harmslab/flowsym',
version='0.2.3',
version='0.2.4',
zip_safe=False,
)

0 comments on commit 62034be

Please sign in to comment.