From ba0c482af2c6537ed430231868e704efcbcb2254 Mon Sep 17 00:00:00 2001
From: Ryan
Date: Mon, 18 Mar 2024 09:14:54 -0700
Subject: [PATCH] Add try/except around urllib parsing. Modify the HSTS domain
 parsing to account for invalid URL forms.

---
 pyhindsight/browsers/chrome.py     | 38 +++++++++++++++++++++---------
 pyhindsight/browsers/webbrowser.py |  6 ++++-
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/pyhindsight/browsers/chrome.py b/pyhindsight/browsers/chrome.py
index fdaba04..41cd5b4 100644
--- a/pyhindsight/browsers/chrome.py
+++ b/pyhindsight/browsers/chrome.py
@@ -2203,21 +2203,37 @@ def get_site_characteristics(self, path, dir_name):
     def build_hsts_domain_hashes(self):
         domains = set()
         for artifact in self.parsed_artifacts:
-            if isinstance(artifact, self.HistoryItem):
-                artifact_url = artifact.url
+            if not isinstance(artifact, self.HistoryItem):
+                continue
 
-                if not artifact_url:
-                    continue
+            if not artifact.url:
+                continue
+
+            artifact_url = artifact.url
+
+            # Some artifact "URLs" will be in invalid forms, which urllib (rightly)
+            # won't parse. Modify these URLs so they will parse properly.
+            # Examples:
+            # Cookie: ".example.com",
+            # Preferences (cookie_controls_metadata): "https://[*.]example.com"
+            prefixes = ('.', 'https://[*.]', 'http://[*.]')
 
-                # Cookie artifact's "URLs" will be in the form ".example.com",
-                # which won't parse, so modify it so it will
-                if artifact_url and artifact_url.startswith('.'):
-                    artifact_url = 'http://' + artifact_url[1:]
+            for prefix in prefixes:
+                if artifact_url.startswith(prefix):
+                    artifact_url = 'http://' + artifact_url[len(prefix):]
 
+            if artifact_url.endswith(',*'):
+                artifact_url = artifact_url[:-2]
+
+            try:
                 domain = urllib.parse.urlparse(artifact_url).hostname
-                # Some URLs don't have a domain, like local PDF files
-                if domain:
-                    domains.add(domain)
+            except ValueError as e:
+                log.warning(f'Error when parsing domain from {artifact_url}; {e}')
+                continue
+
+            # Some URLs don't have a domain, like local PDF files
+            if domain:
+                domains.add(domain)
 
         for domain in domains:
 
diff --git a/pyhindsight/browsers/webbrowser.py b/pyhindsight/browsers/webbrowser.py
index 06acab7..560e979 100644
--- a/pyhindsight/browsers/webbrowser.py
+++ b/pyhindsight/browsers/webbrowser.py
@@ -114,7 +114,11 @@ def dict_factory(cursor, row):
     def build_md5_hash_list_of_origins(self):
         for artifact in self.parsed_artifacts:
             if isinstance(artifact, self.HistoryItem):
-                domain = urllib.parse.urlparse(artifact.url).hostname
+                try:
+                    domain = urllib.parse.urlparse(artifact.url).hostname
+                except ValueError as e:
+                    log.warning(f'Error when parsing domain from {artifact.url}; {e}')
+                    continue
                 # Some URLs don't have a domain, like local PDF files
                 if domain:
                     self.origin_hashes[hashlib.md5(domain.encode()).hexdigest()] = domain