Commit
refactoring some files
sajad committed Sep 7, 2022
1 parent db84e4f commit 796c71e
Showing 4 changed files with 4 additions and 126 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -17,7 +17,7 @@ Each instance of TLDR9+ and TLDRHQ in the dataset has the following attributes:

* `id`: The ID of the reddit post,
* `document`: User's post text (source),
- * `summary`: Summary/TL;DR of the user's post,
+ * `summary`: User-written summary/TL;DR of the post,
* `ext_labels`: Extractive labels of the post's sentences,
* `rg_labels`: The ROUGE scores of the post's sentences.
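
For illustration, here is a minimal sketch of iterating over instances with these attributes. It assumes the released files are JSON Lines (one instance per line); the file name `train.jsonl` is hypothetical, since this excerpt does not specify the storage format:

```python
import json

# Sketch only: assumes one JSON object per line with the attributes listed
# above; `train.jsonl` is a hypothetical file name.
with open('train.jsonl', encoding='utf-8') as f:
    for line in f:
        instance = json.loads(line)
        print(instance['id'])          # ID of the reddit post
        print(instance['document'])    # user's post text (source)
        print(instance['summary'])     # user-written summary/TL;DR
        print(instance['ext_labels'])  # extractive labels of the post's sentences
        print(instance['rg_labels'])   # ROUGE scores of the post's sentences
        break  # inspect just the first instance
```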

@@ -55,7 +55,7 @@ Some notes about the implementation are outlined below:
- `-read_dir`: the input directory, where all the uncompressed Reddit files are located.
- `-write_dir`: the output directory to which the filtered instances are written one by one.
- `-tldr_th`: the word threshold for filtering TL;DRs. Instances that do not meet this threshold are dropped.
- - `-lower`: a flag indicating either cased or uncased instances should be mined.
+ - `-lower`: a flag indicating whether the crawled instances should be lowercased (uncased) or kept cased.
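
Put together, these flags suggest an invocation along the following lines. This is a sketch only: the script name `mine_tldr.py` and the threshold value 9 (chosen to echo the TLDR9+ naming) are assumptions, as neither appears in this excerpt:

```bash
# Hypothetical invocation; only the four flags come from the list above.
python mine_tldr.py \
    -read_dir ./reddit-uncompressed \
    -write_dir ./filtered-instances \
    -tldr_th 9 \
    -lower
```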

## Citation

@@ -78,4 +78,4 @@ If you intend to use the data or code provided in this repo, please cite the fol
````

## Contact
- Please contact Sajad Sotudeh ( `{firstname}@ir.cs.georgetown.edu` ) in case you have any question(s).
+ Please contact [Sajad Sotudeh](mailto:sajad@ir.cs.georgetown.edu?subject=[TLDR9]%20Dataset%20Question) if you have any questions.
2 changes: 1 addition & 1 deletion Reddit.py → dispatcher.py
@@ -1,5 +1,5 @@

- class Reddit():
+ class Dispatcher():
def __init__(self, args):
self.args = args

112 changes: 0 additions & 112 deletions fetch_file_links.py
@@ -10,118 +10,6 @@
from tqdm import tqdm


def validate_date(date_text, raise_error=False):
try:
datetime.datetime.strptime(date_text, '%Y-%m')
return True
except ValueError:

try:
datetime.datetime.strptime(date_text, '%Y-%m-%d')
return True

except:

if raise_error:
                raise ValueError("Incorrect date format, should be YYYY-MM or YYYY-MM-DD")
else:
return False


def uncompress_file(file_path):

def extract_zst(archive):
input_file = pathlib.Path(archive)
with open(input_file, 'rb') as compressed:
output_path = pathlib.Path(store_dir) / input_file.stem
try:
with open(output_path, 'wb') as of:
decompressor = zstandard.ZstdDecompressor()
decompressor.copy_stream(compressed, of)
                print(f'{input_file} is extracted...')
except Exception:
print(f'{input_file} is NOT extracted...')
print(traceback.format_exc())
os.remove(output_path)

def extract_bz(archive):
try:
with bz2.BZ2File(archive) as fr, open(archive.replace('.bz2', ''), "wb") as fw:
shutil.copyfileobj(fr, fw)
except:
os.remove(archive.replace('.bz2', ''))

def extract_xz(archive):
# fr = lzma.open(archive).read()
# with open(archive.replace('.xz', ''), "wb") as fw:
# import pdb;pdb.set_trace()
try:
fr = lzma.open(archive).read()
with open(archive.replace('.xz', ''), "wb") as fw:
fw.write(fr)
except:
os.remove(archive.replace('.xz', ''))

store_dir = '/'.join(file_path.split('/')[:-1]).replace('comments', 'comments-uncompressed')

try:
if '.zst' in file_path:
extract_zst(file_path)
if '.bz2' in file_path:
extract_bz(file_path)
if '.xz' in file_path:
extract_xz(file_path)

print(f'{file_path} is uncompressed successfully.')
with open('uncompressed_files.txt', mode='a') as fW:
fW.write(file_path.split('/')[-1])
fW.write('\n')

# os.remove(file_path)

except:
with open('corrupted-uncompressed-comments.txt', mode='a') as F:
F.write(file_path)
F.write('\n')
# os.remove(file_path)


def download_zip(url, filePath):
try:
def dl_zip(url):
response = requests.get(url, stream=True)
total_size_in_bytes = int(response.headers.get('content-length', 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open(filePath + file_name, 'wb') as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
with open('corrupted-download-comments.txt', mode='a') as F:
F.write(url)
F.write('\n')
print("ERROR, something went wrong")

file_name = url.split("/")[-1]
stored_file = filePath + file_name
# print(f"Downloading started for {url} and will be saved to {stored_file}")
dl_zip(url)
print(" Downloaded {} ".format(url))
return stored_file
# uncompress_file(stored_file)
# print(" Extracted {}".format(filePath + '/' + file_name))
# os.remove(stored_file)

except Exception as e:
with open('corrupted-download.txt', mode='a') as F:
F.write(url)
F.write('\n')
print("ERROR, something went wrong")
print(e)


def fetch_links(homepage, start_date='2015-06', end_date='2099-12'):

url = urllib.request.urlopen(homepage)
10 changes: 0 additions & 10 deletions reddit_cm_aggregator.py
@@ -54,16 +54,6 @@ def _run(self):
bulk_files.append(bulk_file)

print(f'Processing {len(bulk_files)} bulk files started...')
ctr_processed = 0
ctr_processed_written = 0

# for f in bulk_files:
# local_id_to_file_mapping = self._process_bulk(f)
# combined_output = [self.comment_dict, local_id_to_file_mapping]
#
# self.comment_dict = reduce(reducer, combined_output, {})
#
# import pdb;pdb.set_trace()

pool = Pool(self.args.n_cpus)
for local_id_to_file_mapping in tqdm(pool.imap_unordered(self._process_bulk, bulk_files), total=len(bulk_files)):
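
The retained lines above feed `bulk_files` through `multiprocessing.Pool.imap_unordered` wrapped in `tqdm`, replacing the deleted serial loop: results are yielded as soon as each worker finishes, so the progress bar advances in completion order. Below is a minimal, self-contained sketch of the same pattern; the worker body and file names are placeholders, not the repository's code:

```python
from multiprocessing import Pool
from tqdm import tqdm

def process_bulk(path):
    # Placeholder worker standing in for self._process_bulk, which builds a
    # mapping from comment IDs to the bulk file that contains them.
    return {path: len(path)}

if __name__ == '__main__':
    bulk_files = [f'bulk_{i:02d}.jsonl' for i in range(8)]  # hypothetical inputs
    merged = {}
    with Pool(4) as pool:
        # imap_unordered yields each result as soon as its worker finishes,
        # so the tqdm bar advances in completion order, not submission order.
        for mapping in tqdm(pool.imap_unordered(process_bulk, bulk_files),
                            total=len(bulk_files)):
            merged.update(mapping)
    print(f'{len(merged)} mappings merged')
```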
