Skip to content

Commit

Permalink
chg: [crawler] add option to controls whether the crawler should proc…
Browse files Browse the repository at this point in the history
…eed with crawling onion domains that have not yet been classified as safe or unsafe.
  • Loading branch information
Terrtia committed Feb 6, 2025
1 parent f01cfe7 commit f7964fb
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 5 deletions.
6 changes: 4 additions & 2 deletions bin/crawlers/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def __init__(self):
config_loader = ConfigLoader()

self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False)
self.filter_unknown_onion = crawlers.is_onion_filter_unknown(cache=False)
self.last_config_check = int(time.time())

self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
Expand Down Expand Up @@ -145,6 +146,7 @@ def get_message(self):
# Refresh Config
if int(time.time()) - self.last_config_check > 60:
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled()
self.filter_unknown_onion = crawlers.is_onion_filter_unknown()
self.last_config_check = int(time.time())

# Check if a new Capture can be Launched
Expand All @@ -156,7 +158,7 @@ def get_message(self):
if self.filter_unsafe_onion:
if domain.endswith('.onion'):
try:
if not crawlers.check_if_onion_is_safe(domain):
if not crawlers.check_if_onion_is_safe(domain, unknown=self.filter_unknown_onion):
# print('---------------------------------------------------------')
# print('DOMAIN FILTERED')
task.delete()
Expand Down Expand Up @@ -388,7 +390,7 @@ def save_capture_response(self, parent_id, entries):
# Filter Domain
if self.filter_unsafe_onion:
if current_domain.endswith('.onion'):
if not crawlers.check_if_onion_is_safe(current_domain):
if not crawlers.check_if_onion_is_safe(current_domain, unknown=self.filter_unknown_onion):
return False

# TODO LAST URL
Expand Down
41 changes: 40 additions & 1 deletion bin/lib/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2296,7 +2296,7 @@ def _onion_lookup(onion_url):
return {'error': f'Timeout Error'}


def check_if_onion_is_safe(onion_url):
def check_if_onion_is_safe(onion_url, unknown):
resp = _onion_lookup(onion_url)
if resp:
if isinstance(resp, dict):
Expand All @@ -2305,6 +2305,11 @@ def check_if_onion_is_safe(onion_url):
elif 'error' in resp:
if resp['error']:
raise OnionFilteringError(resp['error'])
elif not unknown:
if isinstance(resp, list):
if len(resp) > 1:
if resp[1] == 404:
return True
return False


Expand Down Expand Up @@ -2351,6 +2356,40 @@ def change_onion_filter_state(new_state):
return True
return False

# # Crawl Unknown Onion # #
def _is_onion_filter_unknown():
    """Load the 'unknown' onion-filter state from the DB and refresh the cache.

    On first access the DB field does not exist yet: it is initialized to
    False (filter disabled). The resolved state is always mirrored into
    r_cache before being returned as a bool.
    """
    raw_state = r_crawler.hget('crawler:onion_filter', 'unknown')
    if raw_state is None:
        # First access: persist the default (disabled) state in the DB
        r_crawler.hset('crawler:onion_filter', 'unknown', str(False))
        state = False
    else:
        state = raw_state == 'True'
    r_cache.set('crawler:onion_filter:unknown', str(state))
    return state

def is_onion_filter_unknown(cache=True):
    """Return whether the 'unknown' onion filter is enabled.

    :param cache: when True, read the cached state first and only fall back
                  to the DB on a cache miss; when False, always reload from
                  the DB.
    :return: bool filter state
    """
    if not cache:
        return _is_onion_filter_unknown()
    res = r_cache.get('crawler:onion_filter:unknown')
    if res is None:
        # Cache miss: _is_onion_filter_unknown() reloads the state from the
        # DB and repopulates the cache itself, so no extra r_cache.set is
        # needed here (the original duplicated that write).
        return _is_onion_filter_unknown()
    return res == 'True'

def change_onion_filter_unknown_state(new_state):
    """Persist a new 'unknown' onion-filter state.

    Writes the state to both the DB and the cache and records the update
    timestamp.

    :param new_state: bool, desired filter state
    :return: True when the state actually changed, False on a no-op
    """
    if is_onion_filter_unknown(cache=False) == new_state:
        return False
    r_crawler.hset('crawler:onion_filter', 'unknown', str(new_state))
    r_cache.set('crawler:onion_filter:unknown', str(new_state))
    now = time.time()
    # Track when the filter configuration last changed
    r_crawler.hset('crawler:onion_filter', 'update_time', now)
    r_cache.set('crawler:onion_filter:last_update_time', now)
    return True

#### ---- ####


Expand Down
15 changes: 14 additions & 1 deletion var/www/blueprints/crawler_splash.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,7 @@ def crawler_settings():
crawler_error_mess = crawlers.get_test_ail_crawlers_message()

is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)
is_onion_filter_unknown = crawlers.is_onion_filter_unknown(cache=False)

# TODO REGISTER PROXY
# all_proxies = crawlers.get_all_proxies_metadata()
Expand All @@ -1011,6 +1012,7 @@ def crawler_settings():
is_crawler_working=is_crawler_working,
crawler_error_mess=crawler_error_mess,
is_onion_filter_enabled=is_onion_filter_enabled,
is_onion_filter_unknown=is_onion_filter_unknown
)


Expand Down Expand Up @@ -1066,9 +1068,20 @@ def crawler_filter_unsafe_onion():
filter_unsafe_onion = True
else:
filter_unsafe_onion = False
print(filter_unsafe_onion)
crawlers.change_onion_filter_state(filter_unsafe_onion)
return redirect(url_for('crawler_splash.crawler_settings'))

@crawler_splash.route('/crawler/settings/crawler/filter_unknown_onion', methods=['GET'])
@login_required
@login_admin
def crawler_filter_unknown_onion():
    """Admin endpoint: toggle the unknown-onion filter via ?state=enable|disable."""
    # Any value other than 'enable' (including a missing param) disables the filter
    new_state = request.args.get('state') == 'enable'
    crawlers.change_onion_filter_unknown_state(new_state)
    return redirect(url_for('crawler_splash.crawler_settings'))


# --- LACUS ---#
30 changes: 29 additions & 1 deletion var/www/templates/crawler/crawler_splash/settings_crawler.html
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ <h5 class="card-title">
</p>
{% if is_onion_filter_enabled %}
<a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=disable">
<button class="btn btn-danger mx-4 my-2">
<button class="btn btn-danger my-2">
<i class="fa-solid fa-xmark"></i> Disable Onion Filter
</button>
</a>
Expand All @@ -254,6 +254,34 @@ <h5 class="card-title">
</button>
</a>
{% endif %}

<hr class="border-1 my-4">

<h5 class="card-title">
                        Unknown Onion Filter: &nbsp;&nbsp;<b class="text-primary"><span class="text-{% if is_onion_filter_unknown %}success{% else %}secondary{% endif %}">{% if is_onion_filter_unknown %}Enabled{% else %}Disabled{% endif %}</span></b>
</h5>
<p>This option controls whether the crawler should proceed with crawling onion domains that have <strong>not yet been classified</strong> as safe or unsafe.</p>

<ul>
<li><strong>If disabled:</strong> The crawler will process domains that have never been checked, potentially discovering new useful content but also increasing the risk of encountering unsafe materials.</li>
<li><strong>If enabled:</strong> The crawler will only process domains that have been explicitly identified as safe, reducing risk but potentially missing new, unclassified domains.</li>
</ul>

                    <p>Leaving this option disabled is useful for users who want to explore uncharted domains while still benefiting from the <code>filter_unsafe_onion</code> protection. However, disabling it increases the likelihood of encountering harmful content, so caution is advised.</p>
{% if is_onion_filter_unknown %}
<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=disable">
<button class="btn btn-secondary my-2">
<i class="fa-solid fa-xmark"></i> Disable Unknown Onion Filter
</button>
</a>
{% else %}
<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=enable">
<button class="btn btn-info my-2">
<i class="fa-solid fa-check"></i> Enable Unknown Onion Filter
</button>
</a>
{% endif %}

</div>
</div>

Expand Down

0 comments on commit f7964fb

Please sign in to comment.