-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcompute_stats.py
556 lines (458 loc) · 20.1 KB
/
compute_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
# Objective: compute a score for each Steam game and then rank all the games while favoring hidden gems.
import ast
from pathlib import Path
import numpy as np
from src.appids import APP_ID_CONTRADICTION
def compute_score_generic(
    my_tuple,
    parameter_list,
    language=None,
    popularity_measure_str=None,
    quality_measure_str=None,
):
    # Objective: compute a score for one Steam game.
    #
    # Input:    - my_tuple holds all the retrieved information regarding one game:
    #               * if language is None, it is indexed by position, and the following entries are read:
    #                 [1] Wilson score, [2] Bayesian rating, [3] number of owners,
    #                 [7] number of positive reviews, [8] number of negative reviews.
    #               * otherwise, my_tuple[language] is a dictionary with the keys
    #                 'wilson_score', 'bayesian_rating', 'num_owners' and 'num_reviews'.
    #           - parameter_list is a list of parameters to calibrate the ranking.
    #             Currently, there is only one parameter, alpha, which is read from parameter_list[0].
    #           - optional language to allow to compute regional rankings of hidden gems
    #           - optional choice of popularity measure: either 'num_owners' (default), or 'num_reviews'
    #             Any value other than None and 'num_owners' selects the number of reviews.
    #           - optional choice of quality measure: either 'wilson_score' (default) or 'bayesian_rating'
    #             Any value other than None and 'wilson_score' selects the Bayesian rating.
    # Output:   game score, equal to: quality * alpha / (alpha + popularity)
    alpha = parameter_list[0]

    if language is None:
        wilson_score = my_tuple[1]
        bayesian_rating = my_tuple[2]
        # Counts may be stored as strings, hence the explicit conversions.
        # Fields which do not contribute to the score (number of players, playtimes) are deliberately not read,
        # so that a missing value for one of them cannot prevent the game from being scored.
        num_owners = float(my_tuple[3])
        num_reviews = float(my_tuple[7]) + float(my_tuple[8])
    else:
        regional_data = my_tuple[language]
        wilson_score = regional_data["wilson_score"]
        bayesian_rating = regional_data["bayesian_rating"]
        num_owners = regional_data["num_owners"]
        num_reviews = regional_data["num_reviews"]

    if quality_measure_str is None or quality_measure_str == "wilson_score":
        quality_measure = wilson_score
    else:
        quality_measure = bayesian_rating

    if popularity_measure_str is None or popularity_measure_str == "num_owners":
        popularity_measure = num_owners
    else:
        popularity_measure = num_reviews

    def decreasing_fun(x):
        # Decreasing function: equal to 1 for x = 0, and to 1/2 for x = alpha
        return alpha / (alpha + x)

    return quality_measure * decreasing_fun(popularity_measure)
# noinspection PyPep8Naming
def rank_games(
    d,
    parameter_list,
    verbose=False,
    appid_reference_set=None,
    language=None,
    popularity_measure_str=None,
    quality_measure_str=None,
    num_top_games_to_print=1000,
    filtered_app_ids_to_show=None,
    filtered_app_ids_to_hide=None,
):
    # Objective: rank all the Steam games, given a parameter alpha.
    #
    # Input:    - local dictionary of data extracted from SteamSpy, which maps an appID to the data of a game
    #           - parameter_list is a list of parameters to calibrate the ranking.
    #           - optional verbosity boolean
    #           - optional set of appID of games chosen as references of hidden gems. By default, only "Contradiction".
    #             Every reference appID has to be a key of d, otherwise a KeyError is raised.
    #           - optional language to allow to compute regional rankings of hidden gems. cf. compute_regional_stats.py
    #           - optional choice of popularity measure: either 'num_owners', or 'num_reviews'
    #           - optional choice of quality measure: either 'wilson_score' or 'bayesian_rating'
    #           - optional number of top games to print if the ranking is only partially displayed
    #             By default, only the top 1000 games are displayed.
    #             If set to None, the ranking will be fully displayed.
    #           - optional set of appID of games to show (and only these games are shown).
    #             Typically used to focus on appIDs for specific genres or tags.
    #             If None or empty, behavior is unintuitive yet exceptional: every game is shown, appIDs are not
    #             filtered-in.
    #           - optional set of appID of games to hide.
    #             Typically used to exclude appIDs for specific genres or tags.
    #             If None or empty, the behavior is intuitive: no game is specifically hidden.
    # Output:   a 2-tuple consisting of:
    #           - a scalar value summarizing ranks of games used as references of "hidden gems" (average rank)
    #           - the ranking to be ultimately displayed. A list of 3-element lists: [rank, game_name, appid].
    #             If verbose is False, the returned ranking is empty.
    if appid_reference_set is None:
        appid_reference_set = {APP_ID_CONTRADICTION}
    if filtered_app_ids_to_show is None:
        filtered_app_ids_to_show = set()
    if filtered_app_ids_to_hide is None:
        filtered_app_ids_to_hide = set()

    # Boolean to decide whether printing the ranking of the top games, rather than the ranking of the whole Steam
    # catalog. It makes the script finish faster, and usually, we are only interested in the top games anyway.
    print_subset_of_top_games = num_top_games_to_print is not None
    # Boolean to decide whether there is a filtering-in of appIDs (typically to filter-in genres or tags).
    print_filtered_app_ids_only = len(filtered_app_ids_to_show) != 0
    # Boolean to decide whether there is a filtering-out of appIDs (typically to filter-out genres or tags).
    hide_filtered_app_ids_only = len(filtered_app_ids_to_hide) != 0

    def compute_score(appid):
        return compute_score_generic(
            d[appid],
            parameter_list,
            language,
            popularity_measure_str,
            quality_measure_str,
        )

    # Rank all the Steam games.
    # AppIDs are sorted, rather than game names, because appIDs are unique whereas two games may share a name.
    # The sort is stable, so games with equal scores are kept in the iteration order of d.
    sorted_app_ids = sorted(d, key=compute_score, reverse=True)
    rank_by_app_id = {
        appid: rank for rank, appid in enumerate(sorted_app_ids, start=1)
    }

    name_index = 0 if language is None else "name"

    # For each reference game: [rank of the game, whether the game should appear in the ranking]
    reference_dict = {}
    for appid_reference in appid_reference_set:
        # Find the rank of this game used as a reference of a "hidden gem"
        rank_game_used_as_reference_for_hidden_gem = rank_by_app_id[appid_reference]

        # Find whether the reference game should appear in the ranking (it might not due to tag filters)
        if language is None:
            # The flag is stored as the last entry of the data of the game
            bool_reference_game_should_appear_in_ranking = d[appid_reference][-1]
        else:
            bool_reference_game_should_appear_in_ranking = True

        reference_dict[appid_reference] = [
            rank_game_used_as_reference_for_hidden_gem,
            bool_reference_game_should_appear_in_ranking,
        ]

    ranks_of_reference_hidden_gems = [v[0] for v in reference_dict.values()]
    scalar_summarizing_ranks_of_reference_hidden_gems = np.average(
        ranks_of_reference_hidden_gems,
    )

    # Save the ranking for later display
    ranking_list = []

    if not verbose:
        # The ranking is only built for display purposes, so skip this work, e.g. during the optimization of alpha.
        return scalar_summarizing_ranks_of_reference_hidden_gems, ranking_list

    print(
        "Objective function to minimize:\t",
        scalar_summarizing_ranks_of_reference_hidden_gems,
    )

    # Populate the variable ranking_list
    num_games_to_print = len(sorted_app_ids)
    if print_subset_of_top_games:
        num_games_to_print = min(num_top_games_to_print, num_games_to_print)

    # A reference game which should not appear is skipped below: to compensate, one more game is visited for each
    # such reference game found among the top games.
    for (
        rank_game_used_as_reference_for_hidden_gem,
        bool_reference_game_should_appear_in_ranking,
    ) in reference_dict.values():
        if (
            not bool_reference_game_should_appear_in_ranking
            and rank_game_used_as_reference_for_hidden_gem <= num_games_to_print
        ):
            num_games_to_print += 1

    # Check: never visit more games than available
    num_games_to_print = min(len(sorted_app_ids), num_games_to_print)

    # Number of skipped reference games so far, so that the displayed ranks stay contiguous
    rank_decrease = 0

    for i in range(num_games_to_print):
        appid = sorted_app_ids[i]

        if appid in reference_dict and not reference_dict[appid][1]:
            # Reference game which should not appear in the ranking
            rank_decrease += 1
            continue

        current_rank = (i + 1) - rank_decrease

        is_filtered_in = (
            not print_filtered_app_ids_only or appid in filtered_app_ids_to_show
        )
        is_not_filtered_out = (
            not hide_filtered_app_ids_only or appid not in filtered_app_ids_to_hide
        )

        if is_filtered_in and is_not_filtered_out:
            # Append the ranking info
            game_name = d[appid][name_index]
            ranking_list.append([current_rank, game_name, appid])

    return scalar_summarizing_ranks_of_reference_hidden_gems, ranking_list
# noinspection PyPep8Naming
def optimize_for_alpha(
    d,
    verbose=True,
    appid_reference_set=None,
    language=None,
    popularity_measure_str=None,
    quality_measure_str=None,
):
    # Objective: find the optimal value of the parameter alpha
    #
    # Input:    - local dictionary of data extracted from SteamSpy
    #           - optional verbosity boolean
    #           - optional set of appID of games chosen as references of hidden gems. By default, only "Contradiction".
    #           - optional language to allow to compute regional rankings of hidden gems. cf. compute_regional_stats.py
    #           - optional choice of popularity measure: either 'num_owners' (default), or 'num_reviews'
    #           - optional choice of quality measure: either 'wilson_score' or 'bayesian_rating'
    # Output:   list of optimal parameters (by default, only one parameter is optimized: alpha)
    #           The single element is the array returned by the optimizer as its solution.
    # Raises:   AssertionError if language is None and the popularity measure is not one of the supported values.
    if appid_reference_set is None:
        appid_reference_set = {APP_ID_CONTRADICTION}

    from math import log10

    from scipy.optimize import minimize

    # Goal: find the optimal value for alpha by minimizing the rank of games chosen as references of "hidden gems"
    def function_to_minimize(x):
        # Verbosity is switched off, so that the ranking is not printed at each evaluation.
        return rank_games(
            d,
            [x],
            False,
            appid_reference_set,
            language,
            popularity_measure_str,
            quality_measure_str,
        )[0]

    # Gather the popularity of every game, in order to choose the starting point of the optimization
    if language is None:
        if popularity_measure_str is None or popularity_measure_str == "num_owners":
            vec = [float(game[get_index_num_owners()]) for game in d.values()]
        else:
            if not (popularity_measure_str == "num_reviews"):
                raise AssertionError
            vec = [get_num_reviews(game) for game in d.values()]
    else:
        # A popularity measure left to None stands for 'num_owners', consistently with the computation of scores.
        popularity_key = (
            "num_owners" if popularity_measure_str is None else popularity_measure_str
        )
        vec = [game[language][popularity_key] for game in d.values()]

    def choose_x0(data_vec):
        # Start just above the largest observed popularity
        return 1 + np.max(data_vec)

    res = minimize(fun=function_to_minimize, x0=choose_x0(vec), method="Nelder-Mead")
    optimal_parameters = [res.x]

    alpha = np.squeeze(optimal_parameters[0])
    try:
        optimal_power = log10(alpha)
        if verbose:
            print(f"alpha = 10^{optimal_power:.2f}")
    except ValueError:
        # log10 is not defined for this value of alpha: display the raw value instead
        if verbose:
            print(f"alpha = {alpha:.2f}")

    return optimal_parameters
def save_ranking_to_file(
    output_filename,
    ranking_list,
    only_show_appid=False,
    verbose=False,
    width=40,
):
    # Objective: save the ranking to the output text file
    #
    # Input:    - name of the output text file, which is overwritten
    #           - ranking, as a list of entries: the rank is the first element of an entry, the name of the game is
    #             the second element, and the appID is the last element. The appID can be a string or an integer.
    #           - bool to decide whether to write one appID per line, instead of one Markdown link per line
    #           - optional verbosity boolean, to also print to screen every line written to the file
    #           - minimal width of the store URL, which is padded with spaces on the right
    # Output:   None
    base_steam_store_url = "https://store.steampowered.com/app/"

    with Path(output_filename).open("w", encoding="utf8") as outfile:
        for current_ranking_info in ranking_list:
            current_rank = current_ranking_info[0]
            game_name = current_ranking_info[1]
            appid = current_ranking_info[-1]

            if only_show_appid:
                sentence = str(appid)
            else:
                # Formatting, rather than concatenation, so that integer appIDs are supported as well
                store_url = f"{base_steam_store_url}{appid}"
                store_url_fixed_width = f"{store_url: <{width}}"
                sentence = f"{current_rank:05}.\t[{game_name}]({store_url_fixed_width})"

            print(sentence, file=outfile)
            if verbose:
                print(sentence)
def get_index_num_owners():
    # Objective: provide the position of the number of owners in the data of a game
    index_num_owners = 3
    return index_num_owners
def get_index_num_positive_reviews():
    # Objective: provide the position of the number of positive reviews in the data of a game
    index_num_positive_reviews = 7
    return index_num_positive_reviews
def get_index_num_negative_reviews():
    # Objective: provide the position of the number of negative reviews in the data of a game
    index_num_negative_reviews = 8
    return index_num_negative_reviews
def get_num_reviews(game):
    # Objective: compute the total number of reviews of a game
    #
    # Input:    data of one game, indexed by position
    # Output:   number of positive reviews plus number of negative reviews, as an integer
    num_positive_reviews = int(game[get_index_num_positive_reviews()])
    num_negative_reviews = int(game[get_index_num_negative_reviews()])
    return num_positive_reviews + num_negative_reviews
# noinspection PyPep8Naming
def compute_ranking(
    d,
    num_top_games_to_print=None,
    keywords_to_include=None,
    keywords_to_exclude=None,
    language=None,
    perform_optimization_at_runtime=True,
    popularity_measure_str=None,
    quality_measure_str=None,
):
    # Objective: compute a ranking of hidden gems
    #
    # Input:    - local dictionary of data extracted from SteamSpy
    #           - maximal length of the ranking
    #             The higher the value, the longer it takes to compute and print the ranking.
    #             If set to None, there is no limit, so the whole Steam catalog is ranked.
    #           - tags to filter-in (None is handled as an empty list)
    #             Warning because unintuitive: to avoid filtering-in, please use an empty list.
    #           - tags to filter-out (None is handled as an empty list)
    #           - optional language to allow to compute regional rankings of hidden gems. cf. compute_regional_stats.py
    #           - bool to decide whether to optimize alpha at run-time, or to rely on a hard-coded value instead
    #           - optional choice of popularity measure: either 'num_owners' (default), or 'num_reviews'
    #           - optional choice of quality measure: either 'wilson_score' (default) or 'bayesian_rating'
    #
    # Output:   ranking of hidden gems, as returned by rank_games()
    #
    # Raises:   AssertionError if a hard-coded alpha is requested for an unknown popularity or quality measure
    keywords_to_include = [] if keywords_to_include is None else keywords_to_include
    keywords_to_exclude = [] if keywords_to_exclude is None else keywords_to_exclude

    from src.appids import appid_hidden_gems_reference_set
    from src.download_json import (
        get_appid_by_keyword_list_to_exclude,
        get_appid_by_keyword_list_to_include,
    )

    if perform_optimization_at_runtime:
        optimal_parameters = optimize_for_alpha(
            d,
            verbose=True,
            appid_reference_set=appid_hidden_gems_reference_set,
            language=language,
            popularity_measure_str=popularity_measure_str,
            quality_measure_str=quality_measure_str,
        )
    else:
        # Exponents (in base 10) of the optimal alpha, as computed on May 19, 2018.
        # Values reached by the objective function to minimize, respectively: 2156.36, 1900.00, 2372.90, 2094.00
        hard_coded_powers = {
            ("num_owners", "wilson_score"): 6.52,
            ("num_owners", "bayesian_rating"): 6.63,
            ("num_reviews", "wilson_score"): 4.83,
            ("num_reviews", "bayesian_rating"): 4.89,
        }

        # A measure left to None stands for the default measure
        popularity_key = (
            "num_owners" if popularity_measure_str is None else popularity_measure_str
        )
        quality_key = (
            "wilson_score" if quality_measure_str is None else quality_measure_str
        )

        if popularity_key not in ("num_owners", "num_reviews"):
            raise AssertionError
        if quality_key not in ("wilson_score", "bayesian_rating"):
            raise AssertionError

        optimal_parameters = [pow(10, hard_coded_powers[(popularity_key, quality_key)])]

    # Filter-in games which meta-data includes ALL the following keywords
    # Caveat: the more keywords, the fewer games are filtered-in! cf. intersection of sets in the code
    app_ids_to_show = get_appid_by_keyword_list_to_include(keywords_to_include)

    # Filter-out games which meta-data includes ANY of the following keywords
    # NB: the more keywords, the more games are excluded. cf. union of sets in the code
    app_ids_to_hide = get_appid_by_keyword_list_to_exclude(keywords_to_exclude)

    # Only the ranking is of interest here, not the value of the objective function
    ranking = rank_games(
        d,
        optimal_parameters,
        verbose=True,
        appid_reference_set=appid_hidden_gems_reference_set,
        language=language,
        popularity_measure_str=popularity_measure_str,
        quality_measure_str=quality_measure_str,
        num_top_games_to_print=num_top_games_to_print,
        filtered_app_ids_to_show=app_ids_to_show,
        filtered_app_ids_to_hide=app_ids_to_hide,
    )[1]

    return ranking
def run_workflow(
    quality_measure_str="wilson_score",
    popularity_measure_str="num_reviews",
    perform_optimization_at_runtime=True,
    num_top_games_to_print=250,
    verbose=False,
    language=None,
    keywords_to_include=None,
    keywords_to_exclude=None,
):
    # Objective: save to disk a ranking of hidden gems.
    #
    # Input:
    #           - optional choice of quality measure: either 'wilson_score' or 'bayesian_rating'
    #           - optional choice of popularity measure: either 'num_owners', or 'num_reviews'
    #           - bool to decide whether to optimize alpha at run-time, or to rely on a hard-coded value instead
    #           - maximal length of the ranking
    #             The higher the value, the longer it takes to compute and print the ranking.
    #             If set to None, there is no limit, so the whole Steam catalog is ranked.
    #           - optional verbosity boolean, to also print to screen what is written to the output files
    #           - optional language to allow to compute regional rankings of hidden gems
    #           - tags to filter-in, e.g. ["Rogue-Like"]
    #             Warning because unintuitive: to avoid filtering-in, please use an empty list.
    #           - tags to filter-out, e.g. ["Visual Novel", "Anime"]
    #
    # Output:   True, once the ranking of hidden gems has been written to 'hidden_gems.md' (in a format parsable by
    #           Github Gist), and to 'idlist.txt' (as a list of appIDs).
    keywords_to_include = [] if keywords_to_include is None else keywords_to_include
    keywords_to_exclude = [] if keywords_to_exclude is None else keywords_to_exclude

    # Import the local dictionary, which was stored in the following text file
    input_filename = "dict_top_rated_games_on_steam.txt"
    with Path(input_filename).open(encoding="utf8") as infile:
        lines = infile.readlines()

    # The dictionary is on the second line
    # noinspection PyPep8Naming
    d = ast.literal_eval(lines[1])

    ranking = compute_ranking(
        d,
        num_top_games_to_print=num_top_games_to_print,
        keywords_to_include=keywords_to_include,
        keywords_to_exclude=keywords_to_exclude,
        language=language,
        perform_optimization_at_runtime=perform_optimization_at_runtime,
        popularity_measure_str=popularity_measure_str,
        quality_measure_str=quality_measure_str,
    )

    # The same ranking is saved twice: first as Markdown links, then as a bare list of appIDs
    output_files = (
        ("hidden_gems.md", False),
        ("idlist.txt", True),
    )
    for output_filename, only_show_appid in output_files:
        save_ranking_to_file(
            output_filename,
            ranking,
            only_show_appid=only_show_appid,
            verbose=verbose,
        )

    return True
def main():
    # Objective: run the workflow with the settings used when this file is executed as a script
    #
    # Output:   True
    workflow_settings = {
        "quality_measure_str": "wilson_score",  # Either 'wilson_score' or 'bayesian_rating'
        "popularity_measure_str": "num_reviews",  # Either 'num_reviews' or 'num_owners'
        "perform_optimization_at_runtime": True,
        "num_top_games_to_print": 1000,
        "verbose": False,
        "language": None,
        "keywords_to_include": None,
        "keywords_to_exclude": None,
    }
    run_workflow(**workflow_settings)

    return True
# Run the workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()