From ba0c482af2c6537ed430231868e704efcbcb2254 Mon Sep 17 00:00:00 2001
From: Ryan <ryan@obsidianforensics.com>
Date: Mon, 18 Mar 2024 09:14:54 -0700
Subject: [PATCH] Add try/except around urllib parsing. Modify the HSTS domain
 parsing to account for invalid URL forms.

---
 pyhindsight/browsers/chrome.py     | 38 +++++++++++++++++++++---------
 pyhindsight/browsers/webbrowser.py |  6 ++++-
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/pyhindsight/browsers/chrome.py b/pyhindsight/browsers/chrome.py
index fdaba04..41cd5b4 100644
--- a/pyhindsight/browsers/chrome.py
+++ b/pyhindsight/browsers/chrome.py
@@ -2203,21 +2203,37 @@ def get_site_characteristics(self, path, dir_name):
     def build_hsts_domain_hashes(self):
         domains = set()
         for artifact in self.parsed_artifacts:
-            if isinstance(artifact, self.HistoryItem):
-                artifact_url = artifact.url
+            if not isinstance(artifact, self.HistoryItem):
+                continue
 
-                if not artifact_url:
-                    continue
+            if not artifact.url:
+                continue
+
+            artifact_url = artifact.url
+
+            # Some artifact "URLs" will be in invalid forms, which urllib (rightly)
+            # won't parse. Modify these URLs so they will parse properly.
+            # Examples:
+            #   Cookie: ".example.com",
+            #   Preferences (cookie_controls_metadata): "https://[*.]example.com"
+            prefixes = ('.', 'https://[*.]', 'http://[*.]')
 
-                # Cookie artifact's "URLs" will be in the form ".example.com",
-                # which won't parse, so modify it so it will
-                if artifact_url and artifact_url.startswith('.'):
-                    artifact_url = 'http://' + artifact_url[1:]
+            for prefix in prefixes:
+                if artifact_url.startswith(prefix):
+                    artifact_url = 'http://' + artifact_url[len(prefix):]
 
+            if artifact_url.endswith(',*'):
+                artifact_url = artifact_url[:-2]
+
+            try:
                 domain = urllib.parse.urlparse(artifact_url).hostname
-                # Some URLs don't have a domain, like local PDF files
-                if domain:
-                    domains.add(domain)
+            except ValueError as e:
+                log.warning(f'Error when parsing domain from {artifact_url}; {e}')
+                continue
+
+            # Some URLs don't have a domain, like local PDF files
+            if domain:
+                domains.add(domain)
 
         for domain in domains:
 
diff --git a/pyhindsight/browsers/webbrowser.py b/pyhindsight/browsers/webbrowser.py
index 06acab7..560e979 100644
--- a/pyhindsight/browsers/webbrowser.py
+++ b/pyhindsight/browsers/webbrowser.py
@@ -114,7 +114,11 @@ def dict_factory(cursor, row):
     def build_md5_hash_list_of_origins(self):
         for artifact in self.parsed_artifacts:
             if isinstance(artifact, self.HistoryItem):
-                domain = urllib.parse.urlparse(artifact.url).hostname
+                try:
+                    domain = urllib.parse.urlparse(artifact.url).hostname
+                except ValueError as e:
+                    log.warning(f'Error when parsing domain from {artifact.url}; {e}')
+                    continue
                 # Some URLs don't have a domain, like local PDF files
                 if domain:
                     self.origin_hashes[hashlib.md5(domain.encode()).hexdigest()] = domain