From e9cf56a70e6ff14c9903f25ca7379ef4a974533c Mon Sep 17 00:00:00 2001
From: Christovis <christoph.becker@durham.ac.uk>
Date: Sat, 1 May 2021 22:33:38 +0100
Subject: [PATCH 1/4] address YAMLLoadWarning and change to safe_load

---
 bigbang/mailman.py | 2 +-
 config/config.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bigbang/mailman.py b/bigbang/mailman.py
index b257ea8c..8d3804b3 100644
--- a/bigbang/mailman.py
+++ b/bigbang/mailman.py
@@ -259,7 +259,7 @@ def access_provenance(directory):
     file_path = os.path.join(directory, PROVENANCE_FILENAME)
     if os.path.isfile(file_path):  # a provenance file already exists
         file_handle = open(file_path, "r")
-        provenance = yaml.load(file_handle)
+        provenance = yaml.safe_load(file_handle)
         return provenance
     return None
 
diff --git a/config/config.py b/config/config.py
index e2cd1ebc..d76a3a29 100644
--- a/config/config.py
+++ b/config/config.py
@@ -5,7 +5,7 @@
 base_loc = os.path.abspath(os.path.join(file_path, os.pardir)) # parent directory of config directory
 config_filepath = os.path.join(base_loc, "config", "config.yml")
 stream = open(config_filepath, "r")
-dictionary = yaml.load(stream)
+dictionary = yaml.safe_load(stream)
 
 class Config(object):
 	def __init__(self, conf):

From 3a5e6e3a291516c1d9b351dff36cf9486652e85b Mon Sep 17 00:00:00 2001
From: Christovis <christoph.becker@durham.ac.uk>
Date: Sun, 2 May 2021 01:00:38 +0100
Subject: [PATCH 2/4] make scraping from server through collectmail files work

---
 bigbang/listserv.py | 19 +++++++++++-------
 bigbang/mailman.py  | 48 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/bigbang/listserv.py b/bigbang/listserv.py
index 3b8ff252..58b8efcc 100644
--- a/bigbang/listserv.py
+++ b/bigbang/listserv.py
@@ -961,6 +961,7 @@ def from_mailing_lists(
         login: Optional[Dict[str, str]] = {"username": None, "password": None},
         session: Optional[str] = None,
         only_mlist_urls: bool = True,
+        instant_save: Optional[bool] = True,
     ) -> "ListservArchive":
         """
         Create ListservArchive from a given list of 'ListservList'.
@@ -976,14 +977,18 @@ def from_mailing_lists(
                 session = get_auth_session(url_login, **login)
             lists = []
             for idx, url in enumerate(url_mailing_lists):
-                lists.append(
-                    ListservList.from_url(
-                        name=idx,
-                        url=url,
-                        select=select,
-                        session=session,
-                    )
+                mlist = ListservList.from_url(
+                    name=idx,
+                    url=url,
+                    select=select,
+                    session=session,
                 )
+                if len(mlist) != 0:
+                    if instant_save:
+                        mlist.to_mbox(dir_out=CONFIG.mail_path)
+                    else:
+                        logger.info(f"Recorded the list {mlist.name}.")
+                        lists.append(mlist)
         else:
             lists = url_mailing_lists
         return cls(name, url_root, lists)
diff --git a/bigbang/mailman.py b/bigbang/mailman.py
index 8d3804b3..528e385f 100644
--- a/bigbang/mailman.py
+++ b/bigbang/mailman.py
@@ -10,8 +10,10 @@
 import urllib.error
 import urllib.parse
 import urllib.request
+from urllib.parse import urlparse
 import warnings
 from pprint import pprint as pp
+from typing import Union
 
 import pandas as pd
 import yaml
@@ -27,7 +29,8 @@
 gz_exp = re.compile(r'href="(\d\d\d\d-\w*\.txt\.gz)"')
 ietf_ml_exp = re.compile(r'href="([\d-]+.mail)"')
 w3c_archives_exp = re.compile(r"lists\.w3\.org")
-listserv_archives_exp = re.compile(r"list\.etsi\.org")
+tgpp_archives_exp = re.compile(r'list\.etsi\.org')
+ieee_archives_exp = re.compile(r'listserv\.ieee\.org')
 
 mailing_list_path_expressions = [gz_exp, ietf_ml_exp, txt_exp]
 
@@ -93,11 +96,12 @@ def load_data(
 
 
 def collect_from_url(
-    url: str, archive_dir: str = CONFIG.mail_path, notes=None
+    url: Union[list, str], archive_dir: str = CONFIG.mail_path, notes=None
 ):
     """Collect data from a given url."""
 
-    url = url.rstrip()
+    if isinstance(url, str):
+        url = url.rstrip()
     try:
         has_archives = collect_archive_from_url(
             url, archive_dir=archive_dir, notes=notes
@@ -160,8 +164,13 @@ def collect_from_file(
 ):
     """Collect urls from a file."""
     urls = urls_to_collect(urls_file)
-    for url in urls:
-        collect_from_url(url, archive_dir=archive_dir, notes=notes)
+    if tgpp_archives_exp.search(urls[0]):
+        collect_from_url(urls, archive_dir=archive_dir, notes=notes)
+    elif ieee_archives_exp.search(urls[0]):
+        collect_from_url(urls, archive_dir=archive_dir, notes=notes)
+    else:
+        for url in urls:
+            collect_from_url(url, archive_dir=archive_dir, notes=notes)
 
 
 def get_list_name(url):
@@ -273,24 +282,41 @@ def update_provenance(directory, provenance):
     file_handle.close()
 
 
-def collect_archive_from_url(url, archive_dir=CONFIG.mail_path, notes=None):
+def collect_archive_from_url(
+        url: Union[list, str], archive_dir=CONFIG.mail_path, notes=None,
+):
     """
     Collect archives (generally tar.gz) files from mailmain archive page.
 
     Return True if archives were downloaded, False otherwise
     (for example if the page lists no accessible archive files).
     """
-    list_name = get_list_name(url)
-    logging.info("Getting archive page for %s", list_name)
+    if isinstance(url, str):
+        list_name = get_list_name(url)
+        logging.info("Getting archive page for %s", list_name)
+    elif isinstance(url, list):
+        urls = url
+        url = url[0]
+        url_root = "https://" + urlparse(url).hostname
 
     if w3c_archives_exp.search(url):
         return w3crawl.collect_from_url(url, archive_dir, notes=notes)
-    elif listserv_archives_exp.search(url):
-        listserv.ListservArchive.from_url(
+    elif tgpp_archives_exp.search(url):
+        listserv.ListservArchive.from_mailing_lists(
             name="3GPP",
             url_root=url,
             url_home=url + "HOME",
-            instant_dump=True,
+            instant_save=True,
+            only_mlist_urls=False,
+        )
+    elif ieee_archives_exp.search(url):
+        listserv.ListservArchive.from_mailing_lists(
+            name="IEEE",
+            url_root=url_root,
+            url_mailing_lists=urls,
+            login={'username': '...', 'password': '...'},
+            only_mlist_urls=False,
+            instant_save=True,
         )
 
     response = urllib.request.urlopen(url)

From f9c01eb2525cdd0517b8c99dbed85866786d1e09 Mon Sep 17 00:00:00 2001
From: Christovis <christoph.becker@durham.ac.uk>
Date: Mon, 3 May 2021 23:22:21 +0100
Subject: [PATCH 3/4] fix file paths

---
 bigbang/listserv.py | 21 ++++++++++++++++-----
 bigbang/mailman.py  | 11 ++++++-----
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/bigbang/listserv.py b/bigbang/listserv.py
index 58b8efcc..240be3f1 100644
--- a/bigbang/listserv.py
+++ b/bigbang/listserv.py
@@ -834,7 +834,7 @@ def to_mbox(self, dir_out: str, filename: Optional[str] = None):
             filepath = f"{dir_out}/{self.name}.mbox"
         else:
             filepath = f"{dir_out}/{filename}.mbox"
-        logger.info(f"The list {self.name} is save at {filepath}.")
+        logger.info(f"The list {self.name} is saved at {filepath}.")
         first = True
         for msg in self.messages:
             if first:
@@ -976,16 +976,22 @@ def from_mailing_lists(
             if session is None:
                 session = get_auth_session(url_login, **login)
             lists = []
-            for idx, url in enumerate(url_mailing_lists):
+            for url in url_mailing_lists:
+                mlist_name = url.split('A0=')[-1]
                 mlist = ListservList.from_url(
-                    name=idx,
+                    name=mlist_name,
                     url=url,
                     select=select,
                     session=session,
                 )
                 if len(mlist) != 0:
                     if instant_save:
-                        mlist.to_mbox(dir_out=CONFIG.mail_path)
+                        dir_out = CONFIG.mail_path + name
+                        try:
+                            os.mkdir(dir_out)
+                        except FileExistsError:
+                            pass  # temporary directory already exists, that's cool
+                        mlist.to_mbox(dir_out=dir_out)
                     else:
                         logger.info(f"Recorded the list {mlist.name}.")
                         lists.append(mlist)
@@ -1001,7 +1007,7 @@ def from_listserv_directory(
         folderdsc: str = "*",
         filedsc: str = "*.LOG?????",
         select: Optional[dict] = None,
-    ) -> "ListservList":
+    ) -> "ListservArchive":
         """
         Args:
             name: Name of the archive, e.g. '3GPP'.
@@ -1080,6 +1086,11 @@ def get_lists_from_url(
                     )
                     if len(mlist) != 0:
                         if instant_save:
+                            dir_out = CONFIG.mail_path + name
+                            try:
+                                os.mkdir(dir_out)
+                            except FileExistsError:
+                                pass  # temporary directory already exists, that's cool
                             mlist.to_mbox(dir_out=CONFIG.mail_path)
                             archive.append(mlist.name)
                         else:
diff --git a/bigbang/mailman.py b/bigbang/mailman.py
index 528e385f..c31de38d 100644
--- a/bigbang/mailman.py
+++ b/bigbang/mailman.py
@@ -302,15 +302,16 @@ def collect_archive_from_url(
     if w3c_archives_exp.search(url):
         return w3crawl.collect_from_url(url, archive_dir, notes=notes)
     elif tgpp_archives_exp.search(url):
-        listserv.ListservArchive.from_mailing_lists(
+        return listserv.ListservArchive.from_mailing_lists(
             name="3GPP",
-            url_root=url,
-            url_home=url + "HOME",
-            instant_save=True,
+            url_root=url_root,
+            url_mailing_lists=urls,
+            login={'username': '...', 'password': '...'},
             only_mlist_urls=False,
+            instant_save=True,
         )
     elif ieee_archives_exp.search(url):
-        listserv.ListservArchive.from_mailing_lists(
+        return listserv.ListservArchive.from_mailing_lists(
             name="IEEE",
             url_root=url_root,
             url_mailing_lists=urls,

From aed908fc70b3923aa07efd522497d848b6aeccd8 Mon Sep 17 00:00:00 2001
From: Christovis <christoph.becker@durham.ac.uk>
Date: Tue, 4 May 2021 08:27:23 +0100
Subject: [PATCH 4/4] use os.path.isdir instead of catching error

---
 bigbang/listserv.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/bigbang/listserv.py b/bigbang/listserv.py
index 240be3f1..4e851c14 100644
--- a/bigbang/listserv.py
+++ b/bigbang/listserv.py
@@ -987,10 +987,8 @@ def from_mailing_lists(
                 if len(mlist) != 0:
                     if instant_save:
                         dir_out = CONFIG.mail_path + name
-                        try:
+                        if os.path.isdir(dir_out) is False:
                             os.mkdir(dir_out)
-                        except FileExistsError:
-                            pass  # temporary directory already exists, that's cool
                         mlist.to_mbox(dir_out=dir_out)
                     else:
                         logger.info(f"Recorded the list {mlist.name}.")
@@ -1087,10 +1085,8 @@ def get_lists_from_url(
                     if len(mlist) != 0:
                         if instant_save:
                             dir_out = CONFIG.mail_path + name
-                            try:
+                            if os.path.isdir(dir_out) is False:
                                 os.mkdir(dir_out)
-                            except FileExistsError:
-                                pass  # temporary directory already exists, that's cool
                             mlist.to_mbox(dir_out=CONFIG.mail_path)
                             archive.append(mlist.name)
                         else: