Commit
refactoring some files
sajad committed Sep 7, 2022
1 parent db84e4f commit 796c71e
Showing 4 changed files with 4 additions and 126 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -17,7 +17,7 @@ Each instance of TLDR9+ and TLDRHQ in the dataset has the following attributes:

* `id`: The ID of the reddit post,
* `document`: User's post text (source),
- * `summary`: Summary/TL;DR of the user's post,
+ * `summary`: User-written summary/TL;DR of the post,
* `ext_labels`: Extractive labels of the post's sentences,
* `rg_labels`: The ROUGE scores of the post's sentences.
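
For illustration, here is a minimal sketch of iterating over instances with these attributes. It assumes the released files are JSON Lines (one instance per line); the file name `train.jsonl` is hypothetical, since this excerpt does not specify the storage format:

```python
import json

# Sketch only: assumes one JSON object per line with the attributes listed
# above; `train.jsonl` is a hypothetical file name.
with open('train.jsonl', encoding='utf-8') as f:
    for line in f:
        instance = json.loads(line)
        print(instance['id'])          # ID of the reddit post
        print(instance['document'])    # user's post text (source)
        print(instance['summary'])     # user-written summary/TL;DR
        print(instance['ext_labels'])  # extractive labels of the post's sentences
        print(instance['rg_labels'])   # ROUGE scores of the post's sentences
        break  # inspect just the first instance
```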

@@ -55,7 +55,7 @@ Some notes about the implementation are outlined below:
- `-read_dir`: the input directory, where all the uncompressed Reddit files are located.
- `-write_dir`: the output directory to which the filtered instances are written one by one.
- `-tldr_th`: the word threshold for filtering TL;DRs. Instances that do not meet this threshold are dropped.
- - `-lower`: a flag indicating either cased or uncased instances should be mined.
+ - `-lower`: a flag indicating whether the crawled instances should be lowercased (uncased) or kept cased.
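
Put together, these flags suggest an invocation along the following lines. This is a sketch only: the script name `mine_tldr.py` and the threshold value 9 (chosen to echo the TLDR9+ naming) are assumptions, as neither appears in this excerpt:

```bash
# Hypothetical invocation; only the four flags come from the list above.
python mine_tldr.py \
    -read_dir ./reddit-uncompressed \
    -write_dir ./filtered-instances \
    -tldr_th 9 \
    -lower
```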

## Citation

@@ -78,4 +78,4 @@ If you intend to use the data or code provided in this repo, please cite the fol
````

## Contact
- Please contact Sajad Sotudeh ( `{firstname}@ir.cs.georgetown.edu` ) in case you have any question(s).
+ Please contact [Sajad Sotudeh](mailto:sajad@ir.cs.georgetown.edu?subject=[TLDR9]%20Dataset%20Question) if you have any questions.
2 changes: 1 addition & 1 deletion Reddit.py → dispatcher.py
@@ -1,5 +1,5 @@

- class Reddit():
+ class Dispatcher():
def __init__(self, args):
self.args = args

112 changes: 0 additions & 112 deletions fetch_file_links.py
@@ -10,118 +10,6 @@
from tqdm import tqdm


def validate_date(date_text, raise_error=False):
try:
datetime.datetime.strptime(date_text, '%Y-%m')
return True
except ValueError:

try:
datetime.datetime.strptime(date_text, '%Y-%m-%d')
return True

except:

if raise_error:
                raise ValueError("Incorrect date format, should be YYYY-MM or YYYY-MM-DD")
else:
return False


def uncompress_file(file_path):

def extract_zst(archive):
input_file = pathlib.Path(archive)
with open(input_file, 'rb') as compressed:
output_path = pathlib.Path(store_dir) / input_file.stem
try:
with open(output_path, 'wb') as of:
decompressor = zstandard.ZstdDecompressor()
decompressor.copy_stream(compressed, of)
                print(f'{input_file} is extracted...')
except Exception:
print(f'{input_file} is NOT extracted...')
print(traceback.format_exc())
os.remove(output_path)

def extract_bz(archive):
try:
with bz2.BZ2File(archive) as fr, open(archive.replace('.bz2', ''), "wb") as fw:
shutil.copyfileobj(fr, fw)
except:
os.remove(archive.replace('.bz2', ''))

def extract_xz(archive):
# fr = lzma.open(archive).read()
# with open(archive.replace('.xz', ''), "wb") as fw:
# import pdb;pdb.set_trace()
try:
fr = lzma.open(archive).read()
with open(archive.replace('.xz', ''), "wb") as fw:
fw.write(fr)
except:
os.remove(archive.replace('.xz', ''))

store_dir = '/'.join(file_path.split('/')[:-1]).replace('comments', 'comments-uncompressed')

try:
if '.zst' in file_path:
extract_zst(file_path)
if '.bz2' in file_path:
extract_bz(file_path)
if '.xz' in file_path:
extract_xz(file_path)

print(f'{file_path} is uncompressed successfully.')
with open('uncompressed_files.txt', mode='a') as fW:
fW.write(file_path.split('/')[-1])
fW.write('\n')

# os.remove(file_path)

except:
with open('corrupted-uncompressed-comments.txt', mode='a') as F:
F.write(file_path)
F.write('\n')
# os.remove(file_path)


def download_zip(url, filePath):
try:
def dl_zip(url):
response = requests.get(url, stream=True)
total_size_in_bytes = int(response.headers.get('content-length', 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open(filePath + file_name, 'wb') as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
with open('corrupted-download-comments.txt', mode='a') as F:
F.write(url)
F.write('\n')
print("ERROR, something went wrong")

file_name = url.split("/")[-1]
stored_file = filePath + file_name
# print(f"Downloading started for {url} and will be saved to {stored_file}")
dl_zip(url)
print(" Downloaded {} ".format(url))
return stored_file
# uncompress_file(stored_file)
# print(" Extracted {}".format(filePath + '/' + file_name))
# os.remove(stored_file)

except Exception as e:
with open('corrupted-download.txt', mode='a') as F:
F.write(url)
F.write('\n')
print("ERROR, something went wrong")
print(e)


def fetch_links(homepage, start_date='2015-06', end_date='2099-12'):

url = urllib.request.urlopen(homepage)
10 changes: 0 additions & 10 deletions reddit_cm_aggregator.py
@@ -54,16 +54,6 @@ def _run(self):
bulk_files.append(bulk_file)

print(f'Processing {len(bulk_files)} bulk files started...')
ctr_processed = 0
ctr_processed_written = 0

# for f in bulk_files:
# local_id_to_file_mapping = self._process_bulk(f)
# combined_output = [self.comment_dict, local_id_to_file_mapping]
#
# self.comment_dict = reduce(reducer, combined_output, {})
#
# import pdb;pdb.set_trace()

pool = Pool(self.args.n_cpus)
for local_id_to_file_mapping in tqdm(pool.imap_unordered(self._process_bulk, bulk_files), total=len(bulk_files)):
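
The retained lines above feed `bulk_files` through `multiprocessing.Pool.imap_unordered` wrapped in `tqdm`, replacing the deleted serial loop: results are yielded as soon as each worker finishes, so the progress bar advances in completion order. Below is a minimal, self-contained sketch of the same pattern; the worker body and file names are placeholders, not the repository's code:

```python
from multiprocessing import Pool
from tqdm import tqdm

def process_bulk(path):
    # Placeholder worker standing in for self._process_bulk, which builds a
    # mapping from comment IDs to the bulk file that contains them.
    return {path: len(path)}

if __name__ == '__main__':
    bulk_files = [f'bulk_{i:02d}.jsonl' for i in range(8)]  # hypothetical inputs
    merged = {}
    with Pool(4) as pool:
        # imap_unordered yields each result as soon as its worker finishes,
        # so the tqdm bar advances in completion order, not submission order.
        for mapping in tqdm(pool.imap_unordered(process_bulk, bulk_files),
                            total=len(bulk_files)):
            merged.update(mapping)
    print(f'{len(merged)} mappings merged')
```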
