From 275357ee0ee530d5de65175c8ea9a213b9af83a1 Mon Sep 17 00:00:00 2001 From: Alexey Lisikhin Date: Fri, 14 Apr 2017 21:12:06 +0800 Subject: [PATCH] Remove analytics code from the project --- README.md | 8 - project/analytics/__init__.py | 1 - project/analytics/cases/__init__.py | 1 - project/analytics/cases/count_of_games.py | 38 -- .../cases/fold_against_honitsu_hands.py | 56 --- project/analytics/cases/honitsu_hands.py | 333 ------------------ project/analytics/cases/main.py | 118 ------- project/analytics/debug.py | 42 --- project/analytics/download_game_ids.py | 201 ----------- project/analytics/download_logs_content.py | 92 ----- project/analytics/reproduce_played_round.py | 112 ------ project/process.py | 61 ---- 12 files changed, 1063 deletions(-) delete mode 100644 project/analytics/__init__.py delete mode 100644 project/analytics/cases/__init__.py delete mode 100644 project/analytics/cases/count_of_games.py delete mode 100644 project/analytics/cases/fold_against_honitsu_hands.py delete mode 100644 project/analytics/cases/honitsu_hands.py delete mode 100644 project/analytics/cases/main.py delete mode 100644 project/analytics/debug.py delete mode 100644 project/analytics/download_game_ids.py delete mode 100644 project/analytics/download_logs_content.py delete mode 100644 project/analytics/reproduce_played_round.py delete mode 100644 project/process.py diff --git a/README.md b/README.md index 5bb4ee35..208009fb 100644 --- a/README.md +++ b/README.md @@ -58,14 +58,6 @@ It will allow to determine was a new version improved or not. To be able to run it you need to copy an old ai version to the `mahjong/ai/old_version.py` and run `bots_battle.py`. -## Tenhou logs analytics - -We have an analytics package. It contains: - -- `analytics/download_game_ids.py` - script to load phoenix game ids from the last 7 days -- `analytics/download_logs_content.py` - script to download logs content for ids that were obtained from the previous script -- `analytics/cases/*` - package for different analytics scripts (like % of suji traps) that will be written later - # For developers ## How to run it? diff --git a/project/analytics/__init__.py b/project/analytics/__init__.py deleted file mode 100644 index 40a96afc..00000000 --- a/project/analytics/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- diff --git a/project/analytics/cases/__init__.py b/project/analytics/cases/__init__.py deleted file mode 100644 index 40a96afc..00000000 --- a/project/analytics/cases/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- diff --git a/project/analytics/cases/count_of_games.py b/project/analytics/cases/count_of_games.py deleted file mode 100644 index ab4e2e17..00000000 --- a/project/analytics/cases/count_of_games.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -import sqlite3 - -import logging - -from analytics.cases.main import ProcessDataCase - -logger = logging.getLogger('process') - - -class CountOfGames(ProcessDataCase): - - def process(self): - connection = sqlite3.connect(self.db_file) - - with connection: - cursor = connection.cursor() - - total_games_sql = 'SELECT count(*) from logs' - hanchan_games_sql = 'SELECT count(*) from logs where is_tonpusen = 0;' - - cursor.execute(total_games_sql) - data = cursor.fetchone() - total_games = data and data[0] or 0 - - cursor.execute(hanchan_games_sql) - data = cursor.fetchone() - hanchan_games = data and data[0] or 0 - - tonpusen_games = total_games - hanchan_games - - hanchan_percentage = total_games and (hanchan_games / total_games) * 100 or 0 - tonpusen_percentage = total_games and (tonpusen_games / total_games) * 100 or 0 - - logger.info('Total games: {}'.format(total_games)) - logger.info('Hanchan games: {}, {:.2f}%'.format(hanchan_games, hanchan_percentage)) - logger.info('Tonpusen games: {}, {:.2f}%'.format(tonpusen_games, tonpusen_percentage)) - logger.info('') diff --git a/project/analytics/cases/fold_against_honitsu_hands.py b/project/analytics/cases/fold_against_honitsu_hands.py deleted file mode 100644 index 9c72c4ba..00000000 --- a/project/analytics/cases/fold_against_honitsu_hands.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: utf-8 -*- - -import logging -import sqlite3 - -from analytics.cases.main import ProcessDataCase - -logger = logging.getLogger('process') - - -class AnalyzeHonitsuHands(ProcessDataCase): - HONITSU_ID = 34 - CHINITSU_ID = 35 - - def process(self): - self.calculate_statistics() - logger.info('Done') - - def calculate_statistics(self): - results = self._load_data() - count_of_records = len(results) - - shanten_stat = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0} - for result in results: - shanten_stat[result['open_hand_shanten']] += 1 - - for key in shanten_stat.keys(): - shanten_stat[key] = (shanten_stat[key] / count_of_records) * 100 - - logger.info('Count of records: {}'.format(count_of_records)) - logger.info('Shanten stat: {}'.format(list(shanten_stat.values()))) - - def _load_data(self): - logger.info('Loading data...') - - results = [] - connection = sqlite3.connect(self.db_file) - with connection: - cursor = connection.cursor() - cursor.execute('SELECT * from data WHERE yaku = 34 and is_open_hand = 1;') - data = cursor.fetchall() - - keys = ['is_honor_or_suit_dora', 'is_open_hand', 'is_tonpusen', 'is_tsumo', - 'log_id', 'open_hand_shanten', 'open_hand_step', - 'round_content', 'round_number', 'scores', 'suit', - 'tempai_step', 'winner', 'winner_position', 'yaku', 'year'] - - for result in data: - dict_result = {} - for key_index in range(0, len(keys)): - key = keys[key_index] - dict_result[key] = result[key_index] - - results.append(dict_result) - - return results diff --git a/project/analytics/cases/honitsu_hands.py b/project/analytics/cases/honitsu_hands.py deleted file mode 100644 index 6e806b4c..00000000 --- a/project/analytics/cases/honitsu_hands.py +++ /dev/null @@ -1,333 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import re - -import logging - -import sqlite3 -from distutils.dir_util import mkpath - -from analytics.cases.main import ProcessDataCase -from mahjong.ai.shanten import Shanten -from mahjong.meld import Meld -from mahjong.tile import TilesConverter -from mahjong.utils import is_man, is_pin, is_sou, is_honor - -logger = logging.getLogger('process') - - -class HonitsuHands(ProcessDataCase): - HONITSU_ID = 34 - CHINITSU_ID = 35 - - YAKU_NAMES = { - HONITSU_ID: 'honitsu', - CHINITSU_ID: 'chinitsu', - } - - SUIT_NAMES = [ - 'man', - 'pin', - 'sou' - ] - - # order is important - suits = [ - is_man, - is_pin, - is_sou - ] - - def __init__(self, db_file): - super().__init__(db_file) - - db_directory = os.path.join(self.current_directory, '..', 'temp') - self.local_db_file = os.path.join(db_directory, 'honitsu.db') - if not os.path.exists(db_directory): - mkpath(db_directory) - - def process(self): - self.prepare_data() - # self.analyze_prepared_data() - logger.info('Done') - - def prepare_data(self): - self._set_up_database() - - self.load_all_records() - - filtered_rounds = self.filter_rounds() - filtered_rounds = self._collect_round_data(filtered_rounds) - self._save_data(filtered_rounds) - - def analyze_prepared_data(self): - results = self._load_data() - - for result in results[:10]: - self._debug_honitsu_data(result) - - def filter_rounds(self): - """ - Find all rounds that were ended with honitsu or chinitsu hands - """ - logger.info('Filtering rounds...') - - filtered_rounds = [] - - for game in self.games: - for round_number in range(0, len(game.rounds)): - round_content = game.rounds[round_number] - for tag in round_content: - if 'AGARI' in tag and 'yaku=' in tag: - yaku_temp = self.decoder.get_attribute_content(tag, 'yaku').split(',') - # start at the beginning at take every second item (even) - yaku_list = [int(x) for x in yaku_temp[::2]] - - if self.HONITSU_ID in yaku_list or self.CHINITSU_ID in yaku_list: - # handle double ron - for x in round_content: - if 'AGARI' in x and x != tag: - round_content.remove(x) - - filtered_rounds.append({ - 'log_id': game.log_id, - 'year': game.year, - 'is_tonpusen': game.is_tonpusen, - 'round_content': round_content, - 'round_number': round_number, - }) - - logger.info('Found {} filtered rounds'.format(len(filtered_rounds))) - - return filtered_rounds - - def _collect_round_data(self, filtered_rounds): - logger.info('Collecting rounds data...') - draw_tags = ['T', 'U', 'V', 'W'] - discard_tags = ['D', 'E', 'F', 'G'] - - shanten = Shanten() - - for round_item in filtered_rounds: - content = round_item['round_content'] - # it is important to find winner before data processing - # to collect all data related to this player - winner = None - for tag in content: - if 'AGARI' in tag: - winner = int(self.decoder.get_attribute_content(tag, 'who')) - break - - revealed_tiles = 0 - open_hand_shanten = None - open_hand_step = None - tempai_step = None - player_hand = [] - winner_draw_regex = re.compile('^<[{}]+\d*'.format(draw_tags[winner])) - winner_discard_regex = re.compile('^<[{}]+\d*'.format(discard_tags[winner])) - draw_regex = re.compile(r'^<[TUVW]+\d*') - for tag in content: - if 'INIT' in tag: - player_hand = self._parse_initial_hand(tag, winner) - - # we need to count revealed tiles - # to be able associate action with game "step" - if draw_regex.match(tag) and 'UN' not in tag: - revealed_tiles += 1 - - if winner_draw_regex.match(tag) and 'UN' not in tag: - tile = self.decoder.parse_tile(tag) - player_hand.append(tile) - - if winner_discard_regex.match(tag) and 'DORA' not in tag: - shanten_number = shanten.calculate_shanten(TilesConverter.to_34_array(player_hand)) - if shanten_number == 0: - tempai_step = revealed_tiles // 4 - - tile = self.decoder.parse_tile(tag) - player_hand.remove(tile) - - if '': - tag = self.content[tag_start:x+1] - tag_start = x + 1 - - # not useful tags - if tag and ('mjloggm' in tag or 'TAIKYOKU' in tag): - tag = None - - # new round was started - if tag and 'INIT' in tag: - self.rounds.append(game_round) - game_round = [] - self.total_rounds += 1 - - # the end of the game - if tag and 'owari' in tag: - self.rounds.append(game_round) - - if tag: - # to save some memory we can remove not needed information from logs - if 'INIT' in tag: - # we dont need seed information - find = re.compile(r'shuffle="[^"]*"') - tag = find.sub('', tag) - - # add processed tag to the round - game_round.append(tag) - tag = None - - # first element is player names, ranks and etc. - # we shouldn't consider it as game round - # and for now let's not save it - self.rounds = self.rounds[1:] - - -class ProcessDataCase(object): - decoder = None - db_file = None - games = None - - def __init__(self, db_file): - self.db_file = db_file - self.decoder = TenhouDecoder() - self.current_directory = os.path.dirname(os.path.realpath(__file__)) - - self.games = [] - - def process(self): - raise NotImplemented() - - def load_all_records(self): - # useful for debugging - limit = None - logger.info('Loading data...') - - connection = sqlite3.connect(self.db_file) - - with connection: - cursor = connection.cursor() - - if limit: - cursor.execute("""SELECT log_id, is_tonpusen, log_content FROM logs - WHERE is_processed = 1 and was_error = 0 LIMIT ?;""", [limit]) - else: - cursor.execute("""SELECT log_id, is_tonpusen, log_content FROM logs - WHERE is_processed = 1 and was_error = 0;""") - - data = cursor.fetchall() - - logger.info('Found {} games records'.format(len(data))) - - logger.info('Unzipping and processing games data...') - for item in data: - self.games.append(Game(item[0], item[1] == 1, item[2])) - - total_rounds = 0 - for hanchan in self.games: - total_rounds += hanchan.total_rounds - - logger.info('Found {} rounds'.format(total_rounds)) diff --git a/project/analytics/debug.py b/project/analytics/debug.py deleted file mode 100644 index 63dd6c9f..00000000 --- a/project/analytics/debug.py +++ /dev/null @@ -1,42 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import sqlite3 -import sys - -db_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'db') -db_file = '' - - -def main(): - parse_command_line_arguments() - - connection = sqlite3.connect(db_file) - - with connection: - cursor = connection.cursor() - - cursor.execute('SELECT COUNT(*) from logs;') - total = cursor.fetchone()[0] - - cursor.execute('SELECT COUNT(*) from logs where is_processed = 1;') - processed = cursor.fetchone()[0] - - cursor.execute('SELECT COUNT(*) from logs where was_error = 1;') - with_errors = cursor.fetchone()[0] - - print('Total: {}'.format(total)) - print('Processed: {}'.format(processed)) - print('With errors: {}'.format(with_errors)) - - -def parse_command_line_arguments(): - if len(sys.argv) > 1: - year = sys.argv[1] - else: - year = '2017' - - global db_file - db_file = os.path.join(db_folder, '{}.db'.format(year)) - -if __name__ == '__main__': - main() diff --git a/project/analytics/download_game_ids.py b/project/analytics/download_game_ids.py deleted file mode 100644 index 8cd94424..00000000 --- a/project/analytics/download_game_ids.py +++ /dev/null @@ -1,201 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Script to download latest phoenix games and store their ids in the database -We can run it once a day or so, to get new data -""" -import shutil -from datetime import datetime -import calendar -import gzip -import os - -import sqlite3 -from distutils.dir_util import mkpath - -import requests -import sys - -current_directory = os.path.dirname(os.path.realpath(__file__)) -logs_directory = os.path.join(current_directory, 'data', 'logs') -db_folder = os.path.join(current_directory, 'db') -db_file = '' - -if not os.path.exists(logs_directory): - mkpath(logs_directory) - -if not os.path.exists(db_folder): - mkpath(db_folder) - - -def main(): - parse_command_line_arguments() - - # for the initial set up - # set_up_database() - - download_game_ids() - - results = process_local_files() - if results: - add_logs_to_database(results) - - -def process_local_files(): - """ - Function to process scc*.html files that can be obtained - from the annual archives with logs or from latest phoenix games - """ - print('Preparing the list of games') - - results = [] - for file_name in os.listdir(logs_directory): - if 'scc' not in file_name: - continue - - # after 2013 tenhou produced compressed logs - if '.gz' in file_name: - with gzip.open(os.path.join(logs_directory, file_name), 'r') as f: - for line in f: - line = str(line, 'utf-8') - _process_log_line(line, results) - else: - with open(os.path.join(logs_directory, file_name)) as f: - for line in f: - _process_log_line(line, results) - - print('Found {} games'.format(len(results))) - - shutil.rmtree(logs_directory) - - return results - - -def download_game_ids(): - """ - Download latest phoenix games from tenhou - """ - connection = sqlite3.connect(db_file) - - last_name = '' - with connection: - cursor = connection.cursor() - cursor.execute('SELECT * FROM last_downloads ORDER BY date DESC LIMIT 1;') - data = cursor.fetchone() - if data: - last_name = data[0] - - download_url = 'http://tenhou.net/sc/raw/dat/' - url = 'http://tenhou.net/sc/raw/list.cgi' - - response = requests.get(url) - response = response.text.replace('list(', '').replace(');', '') - response = response.split(',\r\n') - - records_was_added = False - for archive_name in response: - if 'scc' in archive_name: - archive_name = archive_name.split("',")[0].replace("{file:'", '') - - file_name = archive_name - if '/' in file_name: - file_name = file_name.split('/')[1] - - if file_name > last_name: - last_name = file_name - records_was_added = True - - archive_path = os.path.join(logs_directory, file_name) - if not os.path.exists(archive_path): - print('Downloading... {}'.format(archive_name)) - - url = '{}{}'.format(download_url, archive_name) - page = requests.get(url) - with open(archive_path, 'wb') as f: - f.write(page.content) - - if records_was_added: - unix_time = calendar.timegm(datetime.utcnow().utctimetuple()) - with connection: - cursor = connection.cursor() - cursor.execute('INSERT INTO last_downloads VALUES (?, ?);', [last_name, unix_time]) - - -def _process_log_line(line, results): - line = line.strip() - # sometimes there is empty lines in the file - if not line: - return None - - result = line.split('|') - game_type = result[2].strip() - - # we don't need hirosima replays for now - if game_type.startswith('三'): - return None - - # example: 牌譜 - game_id = result[3].split('log=')[1].split('"')[0] - - # example: 四鳳東喰赤 - is_tonpusen = game_type[2] == '東' - - results.append([game_id, is_tonpusen]) - - -def set_up_database(): - """ - Init logs table and add basic indices - :return: - """ - if os.path.exists(db_file): - print('Remove old database') - os.remove(db_file) - - connection = sqlite3.connect(db_file) - - print('Set up new database') - with connection: - cursor = connection.cursor() - cursor.execute(""" - CREATE TABLE logs(log_id text primary key, - is_tonpusen int, - is_processed int, - was_error int, - log_content text); - """) - cursor.execute("CREATE INDEX is_tonpusen_index ON logs (is_tonpusen);") - cursor.execute("CREATE INDEX is_processed_index ON logs (is_processed);") - cursor.execute("CREATE INDEX was_error_index ON logs (was_error);") - - cursor.execute(""" - CREATE TABLE last_downloads(name text, - date int); - """) - - -def add_logs_to_database(results): - """ - Store logs to the sqllite3 database - """ - print('Inserting new values to the database') - connection = sqlite3.connect(db_file) - with connection: - cursor = connection.cursor() - - for item in results: - cursor.execute('INSERT INTO logs VALUES (?, ?, 0, 0, "");', [item[0], - item[1] and 1 or 0]) - - -def parse_command_line_arguments(): - if len(sys.argv) > 1: - year = sys.argv[1] - else: - year = '2017' - - global db_file - db_file = os.path.join(db_folder, '{}.db'.format(year)) - - -if __name__ == '__main__': - main() diff --git a/project/analytics/download_logs_content.py b/project/analytics/download_logs_content.py deleted file mode 100644 index d331a9ed..00000000 --- a/project/analytics/download_logs_content.py +++ /dev/null @@ -1,92 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Script will load log ids from the database and will download log content -""" -import bz2 -import os -import sqlite3 - -import requests -import sys - -db_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'db') -db_file = '' - - -def main(): - parse_command_line_arguments() - - should_continue = True - while should_continue: - try: - limit = 50 - print('Load {} records'.format(limit)) - results = load_not_processed_logs(limit) - if not results: - should_continue = False - - for log_id in results: - print('Process {}'.format(log_id)) - download_log_content(log_id) - except KeyboardInterrupt: - should_continue = False - - -def download_log_content(log_id): - """ - We will download log content and will store it in the file, - also we will store compressed version in the database - """ - url = 'http://e.mjv.jp/0/log/?{0}'.format(log_id) - - binary_content = None - was_error = False - try: - response = requests.get(url) - binary_content = response.content - if 'mjlog' not in response.text: - was_error = True - except Exception as e: - was_error = True - - connection = sqlite3.connect(db_file) - - with connection: - cursor = connection.cursor() - - compressed_content = '' - if not was_error: - try: - compressed_content = bz2.compress(binary_content) - except: - was_error = True - - cursor.execute('UPDATE logs SET is_processed = ?, was_error = ?, log_content = ? WHERE log_id = ?;', - [1, was_error and 1 or 0, compressed_content, log_id]) - - print('Was errors: {}'.format(was_error)) - - -def load_not_processed_logs(limit): - connection = sqlite3.connect(db_file) - - with connection: - cursor = connection.cursor() - cursor.execute('SELECT log_id FROM logs where is_processed = 0 and was_error = 0 LIMIT ?;', [limit]) - data = cursor.fetchall() - results = [x[0] for x in data] - - return results - - -def parse_command_line_arguments(): - if len(sys.argv) > 1: - year = sys.argv[1] - else: - year = '2017' - - global db_file - db_file = os.path.join(db_folder, '{}.db'.format(year)) - -if __name__ == '__main__': - main() diff --git a/project/analytics/reproduce_played_round.py b/project/analytics/reproduce_played_round.py deleted file mode 100644 index fa3d1425..00000000 --- a/project/analytics/reproduce_played_round.py +++ /dev/null @@ -1,112 +0,0 @@ -import re - -from mahjong.ai.discard import DiscardOption -from mahjong.meld import Meld -from mahjong.table import Table -from mahjong.tile import TilesConverter -from tenhou.decoder import TenhouDecoder - - -class Reproducer(object): - round_content = None - player_position = None - stop_tag = None - - def __init__(self, round_content, player_position, stop_tag): - """ - :param round_content: array of round tags - :param player_position: position of the player that will be our bot - """ - self.round_content = round_content - self.player_position = player_position - self.stop_tag = stop_tag - self.decoder = TenhouDecoder() - - def reproduce(self, display_tags=False): - draw_tags = ['T', 'U', 'V', 'W'] - discard_tags = ['D', 'E', 'F', 'G'] - - player_draw = draw_tags[self.player_position] - - player_draw_regex = re.compile('^<[{}]+\d*'.format(''.join(player_draw))) - discard_regex = re.compile('^<[{}]+\d*'.format(''.join(discard_tags))) - - table = Table() - for tag in self.round_content: - if display_tags: - print(tag) - - if not display_tags and tag == self.stop_tag: - break - - if 'INIT' in tag: - values = self.decoder.parse_initial_values(tag) - - shifted_scores = [] - for x in range(0, 4): - shifted_scores.append(values['scores'][self.normalize_position(x, self.player_position)]) - - table.init_round( - values['round_number'], - values['count_of_honba_sticks'], - values['count_of_riichi_sticks'], - values['dora_indicator'], - self.normalize_position(self.player_position, values['dealer']), - shifted_scores, - ) - - hands = [ - [int(x) for x in self.decoder.get_attribute_content(tag, 'hai0').split(',')], - [int(x) for x in self.decoder.get_attribute_content(tag, 'hai1').split(',')], - [int(x) for x in self.decoder.get_attribute_content(tag, 'hai2').split(',')], - [int(x) for x in self.decoder.get_attribute_content(tag, 'hai3').split(',')], - ] - - table.player.init_hand(hands[self.player_position]) - - if player_draw_regex.match(tag) and 'UN' not in tag: - tile = self.decoder.parse_tile(tag) - table.player.draw_tile(tile) - - if discard_regex.match(tag) and 'DORA' not in tag: - tile = self.decoder.parse_tile(tag) - player_sign = tag.upper()[1] - player_seat = self.normalize_position(self.player_position, discard_tags.index(player_sign)) - - if player_seat == 0: - table.player.discard_tile(DiscardOption(table.player, tile // 4, 0, [], 0)) - else: - table.add_discarded_tile(player_seat, tile, False) - - if '