-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ENH: describe as a replacement of AverageCharacter (#570)
* ENH: describe as a replacement of AverageCharacter * perf + fixup * limit_range array input * Apply suggestions from code review Co-authored-by: James Gaboardi <jgaboardi@gmail.com> * rename to y * move to diversity * move assert_result to its own file * add tests * skip mode tests on oldest * use agg * conftest --------- Co-authored-by: James Gaboardi <jgaboardi@gmail.com>
- Loading branch information
1 parent
cf73295
commit cef9290
Showing
10 changed files
with
225 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import numpy as np | ||
from libpysal.graph import Graph | ||
from numpy.typing import NDArray | ||
from pandas import DataFrame, Series | ||
from scipy import stats | ||
|
||
from ..utils import limit_range | ||
|
||
__all__ = ["describe"] | ||
|
||
|
||
def describe(
    y: NDArray[np.float64] | Series,
    graph: Graph,
    q: tuple[float, float] | None = None,
    include_mode: bool = False,
) -> DataFrame:
    """Describe the distribution of values within a set neighbourhood.

    Given the graph, computes the descriptive statistics of values within the
    neighbourhood of each node. Optionally, the values can be limited to a certain
    quantile range before computing the statistics.

    Parameters
    ----------
    y : NDArray[np.float64] | Series
        An 1D array of numeric values to be described.
    graph : libpysal.graph.Graph
        Graph representing spatial relationships between elements.
    q : tuple[float, float] | None, optional
        Tuple of percentages for the percentiles to compute. Values must be between 0
        and 100 inclusive. When set, values below and above the percentiles will be
        discarded before computation of the statistics. The percentiles are computed
        for each neighborhood. By default None.
    include_mode : bool, optional
        Compute mode along with other statistics. Default is False. Mode is
        computationally expensive and not useful for continuous variables.

    Returns
    -------
    DataFrame
        A DataFrame with descriptive statistics in the columns ``mean``,
        ``median``, ``std``, ``min``, ``max`` and ``sum``, followed by ``mode``
        when ``include_mode`` is True.

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built.
    """

    def _describe(values, q, include_mode=False):
        """Compute the statistics of one neighbourhood limited to the ``q`` range."""
        values = limit_range(values.values, q)

        # NOTE(review): ``np.std`` defaults to ddof=0 whereas the pandas ``"std"``
        # aggregation used when ``q is None`` defaults to ddof=1, so the ``std``
        # column is not directly comparable between the two code paths.
        results = [
            np.mean(values),
            np.median(values),
            np.std(values),
            np.min(values),
            np.max(values),
            np.sum(values),
        ]
        if include_mode:
            results.append(stats.mode(values, keepdims=False)[0])
        return results

    # Array input is wrapped so that ``take``/``groupby`` below are available.
    if not isinstance(y, Series):
        y = Series(y)

    # Values are picked positionally using the codes of the second level of the
    # adjacency index and grouped by the codes of the first level (presumably
    # neighbour and focal respectively -- confirm against libpysal).
    grouper = y.take(graph._adjacency.index.codes[1]).groupby(
        graph._adjacency.index.codes[0]
    )

    if q is None:
        # Without a quantile range the built-in pandas aggregations can be used.
        stat_ = grouper.agg(["mean", "median", "std", "min", "max", "sum"])
        if include_mode:
            stat_["mode"] = grouper.agg(lambda x: stats.mode(x, keepdims=False)[0])
    else:
        # Each group yields a list of statistics; transpose them into columns.
        agg = grouper.agg(_describe, q=q, include_mode=include_mode)
        stat_ = DataFrame(zip(*agg, strict=True)).T
        cols = ["mean", "median", "std", "min", "max", "sum"]
        if include_mode:
            cols.append("mode")
        stat_.columns = cols

    return stat_
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
import pandas as pd | ||
import pytest | ||
from pandas.testing import assert_index_equal | ||
|
||
|
||
def assert_result(result, expected, geometry, **kwargs):
    """Validate the summary values, type and index of a computed result.

    Every key of ``expected`` is looked up as a method on ``result``, called
    without arguments, and its return value is compared approximately with the
    mapped value. Afterwards ``result`` is required to be a ``pandas.Series``
    whose index equals ``geometry.index``; ``kwargs`` are forwarded to
    ``assert_index_equal``.
    """
    for method_name, target in expected.items():
        observed = getattr(result, method_name)()
        assert observed == pytest.approx(target)

    assert isinstance(result, pd.Series)
    assert_index_equal(result.index, geometry.index, **kwargs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import geopandas as gpd | ||
import pytest | ||
from libpysal.graph import Graph | ||
from packaging.version import Version | ||
from pandas.testing import assert_frame_equal | ||
|
||
import momepy as mm | ||
|
||
from .conftest import assert_result | ||
|
||
GPD_013 = Version(gpd.__version__) >= Version("0.13") | ||
|
||
|
||
class TestDistribution:
    """Tests of ``mm.describe`` run on the ``bubenec`` buildings layer."""

    def setup_method(self):
        """Read the buildings and build a KNN (k=3) graph on their centroids."""
        path = mm.datasets.get_path("bubenec")
        self.df_buildings = gpd.read_file(path, layer="buildings")
        self.graph = Graph.build_knn(self.df_buildings.centroid, k=3)

    def test_describe(self):
        """Check the summary of every default statistic column."""
        result = mm.describe(self.df_buildings.area, self.graph)

        # Expected summary values keyed by the column of the result they describe.
        expected_by_column = {
            "mean": {
                "mean": 587.3761020554495,
                "sum": 84582.15869598472,
                "min": 50.44045729583316,
                "max": 1187.2662413659234,
            },
            "median": {
                "mean": 577.4640489818667,
                "sum": 83154.8230533888,
                "min": 50.43336175017242,
                "max": 1225.8094201694726,
            },
            "std": {
                "mean": 255.59307136480083,
                "sum": 36805.40227653132,
                "min": 0.05050450812944085,
                "max": 1092.484902679786,
            },
            "min": {
                "mean": 349.53354434499295,
                "sum": 50332.830385678986,
                "min": 50.39387578315866,
                "max": 761.0313042971973,
            },
            "max": {
                "mean": 835.1307128394886,
                "sum": 120258.82264888636,
                "min": 50.49413435416841,
                "max": 2127.7522277389035,
            },
            "sum": {
                "mean": 1762.128306166348,
                "sum": 253746.47608795413,
                "min": 151.32137188749948,
                "max": 3561.79872409777,
            },
        }
        for column, expected in expected_by_column.items():
            assert_result(result[column], expected, self.df_buildings, exact=False)

    def test_describe_quantile(self):
        """Check the ``mean`` column when values are limited to the (25, 75) range."""
        knn15 = Graph.build_knn(self.df_buildings.centroid, k=15)
        result = mm.describe(self.df_buildings.area, knn15, q=(25, 75))

        expected_mean = {
            "mean": 601.6960154385389,
            "sum": 86644.2262231496,
            "min": 250.25984637364323,
            "max": 901.0028506943196,
        }
        assert_result(result["mean"], expected_mean, self.df_buildings, exact=False)

    @pytest.mark.skipif(not GPD_013, reason="get_coordinates() not available")
    def test_describe_mode(self):
        """Check the ``mode`` column computed on building corner counts."""
        corner_counts = mm.corners(self.df_buildings)
        result = mm.describe(corner_counts, self.graph, include_mode=True)

        expected = {
            "mean": 6.152777777777778,
            "sum": 886,
            "min": 4,
            "max": 17,
        }
        assert_result(result["mode"], expected, self.df_buildings, exact=False)

    @pytest.mark.skipif(not GPD_013, reason="get_coordinates() not available")
    def test_describe_quantile_mode(self):
        """Check the ``mode`` column when a quantile range is also applied."""
        knn15 = Graph.build_knn(self.df_buildings.centroid, k=15)
        corner_counts = mm.corners(self.df_buildings)
        result = mm.describe(corner_counts, knn15, q=(25, 75), include_mode=True)

        expected = {
            "mean": 6.958333333333333,
            "sum": 1002.0,
            "min": 4.0,
            "max": 12,
        }
        assert_result(result["mode"], expected, self.df_buildings, exact=False)

    def test_describe_array(self):
        """Series input and plain array input must give the same frame."""
        areas = self.df_buildings.area
        from_series = mm.describe(areas, self.graph)
        from_array = mm.describe(areas.values, self.graph)

        assert_frame_equal(from_series, from_array)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters