Skip to content

Commit

Permalink
Create parse_privacy_policy_TG.py
Browse files Browse the repository at this point in the history
  • Loading branch information
spookyahell authored Sep 5, 2024
1 parent dc84759 commit 593fa0f
Showing 1 changed file with 127 additions and 0 deletions.
127 changes: 127 additions & 0 deletions parse_privacy_policy_TG.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
''You'll need to download a copy from somewhere and give it a path via a seperate file for now since downloading a new copy isn't handled'''

import json, re
from os import sep
from parse_pp_config import path
from bs4 import BeautifulSoup

path_to_html = sep.join([path, 'Telegram Privacy Policy.html'])

print('Loading document from: {0!r}'.format(path_to_html))

# Make sure UTF-8 BOM is properly stored.
with open(path_to_html, encoding = 'utf-8-sig') as f:
html_code = f.read()



soup = BeautifulSoup(html_code, 'html.parser')
elements = soup.find('div', {'id':'dev_page_content'})
object = {}

html_lines = html_code.split('\n')

found_it = False
for line in html_lines:
if ' <td class="c" id="displayMonthEl" title="You are here: ' in line:
found_it = True
break

def insert_on(data = "21:07:30 Sep 05, 2024"):
# Only while archive.orgs uses this time format (--> default input value of the function)
return data[:9]+ 'on ' + data[9:]


if found_it is True:
date = re.search(r' <td class="c" id="displayMonthEl" title="You are here: ([^"]+)">Sep</td>', line)
if date:
the_date = date.group(1)
print('This document was archived via archive.org.')
print('Archived:', insert_on(the_date))
print()

object['date_document_archived'] = the_date
object['archive_source'] = 'web.archive.org'

minor_heading = None; major_heading = None

for element in elements:
if element.name is None:
continue

if element.name == 'p':
#~ print(element.text)

find_strong = element.find('strong')
if not find_strong:
fst = ''
else:
fst = find_strong.text

if fst == element.text:
is_strong = True
else:
is_strong = False

if minor_heading is None:
if not major_heading is None:
if not 'text' in object[major_heading]:
object[major_heading]['text'] = []
object[major_heading]['text'].append({'text':element.text, 'isStrong': is_strong})
else:
if not 'text' in object[major_heading][minor_heading]:
object[major_heading][minor_heading]['text'] = []
object[major_heading][minor_heading]['text'].append({'text':element.text, 'isStrong': is_strong})

elif element.name == 'h3':
# Major heading
major_heading = element.text
minor_heading = None
object[major_heading] = {}
elif element.name == 'h4':
minor_heading = element.text
object[major_heading][minor_heading] = {}
elif element.name == 'ul':
items = []
for item in element:
if item.name is not None:
#~ print('- ', item.text)
items.append('- {0}'.format(item.text))

if minor_heading is None:
if not 'list' in object[major_heading]:
object[major_heading]['list'] = []
object[major_heading]['list'].append(items)
else:
if not 'list' in object[major_heading][minor_heading]:
object[major_heading][minor_heading]['list'] = []
object[major_heading][minor_heading]['list'].append(items)


for major_heading in object:
print(major_heading)

if 'Changes to this' in major_heading:
break

count_strong = 0
for text in object[major_heading]['text']:
if text.get('isStrong') is True:
count_strong += 1

if count_strong == len(object[major_heading]['list']):
object[major_heading]['changes'] = {}
count_up = 0
for idx, item in enumerate(object[major_heading]['text']):
the_text = object[major_heading]['text'][idx]['text']
if object[major_heading]['text'][idx]['isStrong'] is True:
object[major_heading]['changes'][the_text] = object[major_heading]['list'][count_up]
count_up += 1

del object[major_heading]['text']
del object[major_heading]['list']

#~ print(object)

with open('tg-privacy-policy.json', 'w') as f:
f.write(json.dumps(object, indent = 2))

0 comments on commit 593fa0f

Please sign in to comment.