From e9cf56a70e6ff14c9903f25ca7379ef4a974533c Mon Sep 17 00:00:00 2001 From: Christovis Date: Sat, 1 May 2021 22:33:38 +0100 Subject: [PATCH 1/4] address YAMLLoadWarning and change to safe_load --- bigbang/mailman.py | 2 +- config/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigbang/mailman.py b/bigbang/mailman.py index b257ea8c..8d3804b3 100644 --- a/bigbang/mailman.py +++ b/bigbang/mailman.py @@ -259,7 +259,7 @@ def access_provenance(directory): file_path = os.path.join(directory, PROVENANCE_FILENAME) if os.path.isfile(file_path): # a provenance file already exists file_handle = open(file_path, "r") - provenance = yaml.load(file_handle) + provenance = yaml.safe_load(file_handle) return provenance return None diff --git a/config/config.py b/config/config.py index e2cd1ebc..d76a3a29 100644 --- a/config/config.py +++ b/config/config.py @@ -5,7 +5,7 @@ base_loc = os.path.abspath(os.path.join(file_path, os.pardir)) # parent directory of config directory config_filepath = os.path.join(base_loc, "config", "config.yml") stream = open(config_filepath, "r") -dictionary = yaml.load(stream) +dictionary = yaml.safe_load(stream) class Config(object): def __init__(self, conf): From 3a5e6e3a291516c1d9b351dff36cf9486652e85b Mon Sep 17 00:00:00 2001 From: Christovis Date: Sun, 2 May 2021 01:00:38 +0100 Subject: [PATCH 2/4] scraping from server through collectmail files works --- bigbang/listserv.py | 19 +++++++++++------- bigbang/mailman.py | 48 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/bigbang/listserv.py b/bigbang/listserv.py index 3b8ff252..58b8efcc 100644 --- a/bigbang/listserv.py +++ b/bigbang/listserv.py @@ -961,6 +961,7 @@ def from_mailing_lists( login: Optional[Dict[str, str]] = {"username": None, "password": None}, session: Optional[str] = None, only_mlist_urls: bool = True, + instant_save: Optional[bool] = True, ) -> "ListservArchive": """ Create ListservArchive from a given list of 'ListservList'. @@ -976,14 +977,18 @@ def from_mailing_lists( session = get_auth_session(url_login, **login) lists = [] for idx, url in enumerate(url_mailing_lists): - lists.append( - ListservList.from_url( - name=idx, - url=url, - select=select, - session=session, - ) + mlist = ListservList.from_url( + name=idx, + url=url, + select=select, + session=session, ) + if len(mlist) != 0: + if instant_save: + mlist.to_mbox(dir_out=CONFIG.mail_path) + else: + logger.info(f"Recorded the list {mlist.name}.") + lists.append(mlist) else: lists = url_mailing_lists return cls(name, url_root, lists) diff --git a/bigbang/mailman.py b/bigbang/mailman.py index 8d3804b3..528e385f 100644 --- a/bigbang/mailman.py +++ b/bigbang/mailman.py @@ -10,8 +10,10 @@ import urllib.error import urllib.parse import urllib.request +from urllib.parse import urlparse import warnings from pprint import pprint as pp +from typing import Union import pandas as pd import yaml @@ -27,7 +29,8 @@ gz_exp = re.compile(r'href="(\d\d\d\d-\w*\.txt\.gz)"') ietf_ml_exp = re.compile(r'href="([\d-]+.mail)"') w3c_archives_exp = re.compile(r"lists\.w3\.org") -listserv_archives_exp = re.compile(r"list\.etsi\.org") +tgpp_archives_exp = re.compile(r'list\.etsi\.org') +ieee_archives_exp = re.compile(r'listserv\.ieee\.org') mailing_list_path_expressions = [gz_exp, ietf_ml_exp, txt_exp] @@ -93,11 +96,12 @@ def load_data( def collect_from_url( - url: str, archive_dir: str = CONFIG.mail_path, notes=None + url: Union[list, str], archive_dir: str = CONFIG.mail_path, notes=None ): """Collect data from a given url.""" - url = url.rstrip() + if isinstance(url, str): + url = url.rstrip() try: has_archives = collect_archive_from_url( url, archive_dir=archive_dir, notes=notes @@ -160,8 +164,13 @@ def collect_from_file( ): """Collect urls from a file.""" urls = urls_to_collect(urls_file) - for url in urls: - collect_from_url(url, archive_dir=archive_dir, notes=notes) + if tgpp_archives_exp.search(urls[0]): + collect_from_url(urls, archive_dir=archive_dir, notes=notes) + elif ieee_archives_exp.search(urls[0]): + collect_from_url(urls, archive_dir=archive_dir, notes=notes) + else: + for url in urls: + collect_from_url(url, archive_dir=archive_dir, notes=notes) def get_list_name(url): @@ -273,24 +282,41 @@ def update_provenance(directory, provenance): file_handle.close() -def collect_archive_from_url(url, archive_dir=CONFIG.mail_path, notes=None): +def collect_archive_from_url( + url: Union[list, str], archive_dir=CONFIG.mail_path, notes=None, +): """ Collect archives (generally tar.gz) files from mailmain archive page. Return True if archives were downloaded, False otherwise (for example if the page lists no accessible archive files). """ - list_name = get_list_name(url) - logging.info("Getting archive page for %s", list_name) + if isinstance(url, str): + list_name = get_list_name(url) + logging.info("Getting archive page for %s", list_name) + elif isinstance(url, list): + urls = url + url = url[0] + url_root = "https://" + urlparse(url).hostname if w3c_archives_exp.search(url): return w3crawl.collect_from_url(url, archive_dir, notes=notes) - elif listserv_archives_exp.search(url): - listserv.ListservArchive.from_url( + elif tgpp_archives_exp.search(url): + listserv.ListservArchive.from_mailing_lists( name="3GPP", url_root=url, url_home=url + "HOME", - instant_dump=True, + instant_save=True, + only_mlist_urls=False, + ) + elif ieee_archives_exp.search(url): + listserv.ListservArchive.from_mailing_lists( + name="IEEE", + url_root=url_root, + url_mailing_lists=urls, + login={'username': '...', 'password': '...'}, + only_mlist_urls=False, + instant_save=True, ) response = urllib.request.urlopen(url) From f9c01eb2525cdd0517b8c99dbed85866786d1e09 Mon Sep 17 00:00:00 2001 From: Christovis Date: Mon, 3 May 2021 23:22:21 +0100 Subject: [PATCH 3/4] fixed file paths --- bigbang/listserv.py | 21 ++++++++++++++++----- bigbang/mailman.py | 11 ++++++----- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/bigbang/listserv.py b/bigbang/listserv.py index 58b8efcc..240be3f1 100644 --- a/bigbang/listserv.py +++ b/bigbang/listserv.py @@ -834,7 +834,7 @@ def to_mbox(self, dir_out: str, filename: Optional[str] = None): filepath = f"{dir_out}/{self.name}.mbox" else: filepath = f"{dir_out}/{filename}.mbox" - logger.info(f"The list {self.name} is save at {filepath}.") + logger.info(f"The list {self.name} is saved at {filepath}.") first = True for msg in self.messages: if first: @@ -976,16 +976,22 @@ def from_mailing_lists( if session is None: session = get_auth_session(url_login, **login) lists = [] - for idx, url in enumerate(url_mailing_lists): + for url in url_mailing_lists: + mlist_name = url.split('A0=')[-1] mlist = ListservList.from_url( - name=idx, + name=mlist_name, url=url, select=select, session=session, ) if len(mlist) != 0: if instant_save: - mlist.to_mbox(dir_out=CONFIG.mail_path) + dir_out = CONFIG.mail_path + name + try: + os.mkdir(dir_out) + except FileExistsError: + pass # temporary directory already exists, that's cool + mlist.to_mbox(dir_out=dir_out) else: logger.info(f"Recorded the list {mlist.name}.") lists.append(mlist) @@ -1001,7 +1007,7 @@ def from_listserv_directory( folderdsc: str = "*", filedsc: str = "*.LOG?????", select: Optional[dict] = None, - ) -> "ListservList": + ) -> "ListservArchive": """ Args: name: Name of the archive, e.g. '3GPP'. @@ -1080,6 +1086,11 @@ def get_lists_from_url( ) if len(mlist) != 0: if instant_save: + dir_out = CONFIG.mail_path + name + try: + os.mkdir(dir_out) + except FileExistsError: + pass # temporary directory already exists, that's cool mlist.to_mbox(dir_out=CONFIG.mail_path) archive.append(mlist.name) else: diff --git a/bigbang/mailman.py b/bigbang/mailman.py index 528e385f..c31de38d 100644 --- a/bigbang/mailman.py +++ b/bigbang/mailman.py @@ -302,15 +302,16 @@ def collect_archive_from_url( if w3c_archives_exp.search(url): return w3crawl.collect_from_url(url, archive_dir, notes=notes) elif tgpp_archives_exp.search(url): - listserv.ListservArchive.from_mailing_lists( + return listserv.ListservArchive.from_mailing_lists( name="3GPP", - url_root=url, - url_home=url + "HOME", - instant_save=True, + url_root=url_root, + url_mailing_lists=urls, + login={'username': '...', 'password': '...'}, only_mlist_urls=False, + instant_save=True, ) elif ieee_archives_exp.search(url): - listserv.ListservArchive.from_mailing_lists( + return listserv.ListservArchive.from_mailing_lists( name="IEEE", url_root=url_root, url_mailing_lists=urls, From aed908fc70b3923aa07efd522497d848b6aeccd8 Mon Sep 17 00:00:00 2001 From: Christovis Date: Tue, 4 May 2021 08:27:23 +0100 Subject: [PATCH 4/4] use os.path.isdir instead of catching error --- bigbang/listserv.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/bigbang/listserv.py b/bigbang/listserv.py index 240be3f1..4e851c14 100644 --- a/bigbang/listserv.py +++ b/bigbang/listserv.py @@ -987,10 +987,8 @@ def from_mailing_lists( if len(mlist) != 0: if instant_save: dir_out = CONFIG.mail_path + name - try: + if os.path.isdir(dir_out) is False: os.mkdir(dir_out) - except FileExistsError: - pass # temporary directory already exists, that's cool mlist.to_mbox(dir_out=dir_out) else: logger.info(f"Recorded the list {mlist.name}.") @@ -1087,10 +1085,8 @@ def get_lists_from_url( if len(mlist) != 0: if instant_save: dir_out = CONFIG.mail_path + name - try: + if os.path.isdir(dir_out) is False: os.mkdir(dir_out) - except FileExistsError: - pass # temporary directory already exists, that's cool mlist.to_mbox(dir_out=CONFIG.mail_path) archive.append(mlist.name) else: