internetarchive · mekarpeles · Sep 25, 2024 · Sep 24, 2024 · Sep 24, 2024 · Sep 25, 2024
diff --git a/compose.production.yaml b/compose.production.yaml
@@ -73,6 +73,8 @@ services:
       - ../olsystem:/olsystem
       - /1:/1
     deploy:
+      # Note: `replicas` must match the number of `server` entries in the
+      # `upstream covers_backend` block in `docker/covers_nginx.conf`.
       replicas: 2
 
   covers_nginx:

diff --git a/docker/covers_nginx.conf b/docker/covers_nginx.conf
@@ -16,6 +16,15 @@ server {
     ssl_prefer_server_ciphers on;
 }
 
+# Docker's internal load balancing eventually leaves connections unevenly spread,
+# so each covers replica is listed here. Keep this server list in sync with the
+# `replicas` value of the `covers` service in `compose.production.yaml`.
+upstream covers_backend {
+  least_conn;  # Prefer the server with the fewest active connections.
+  server openlibrary-covers-1:7075;
+  server openlibrary-covers-2:7075;
+}
+
 server {
       listen 80;
       listen 443;
@@ -25,8 +34,15 @@ server {
 
       keepalive_timeout 5;
 
+      # Return 429 (Too Many Requests) errors as JSON via the named location below.
+      error_page 429 = @429;  # "=" uses the status returned by @429.
+      location @429 {
+        default_type application/json;
+        return 429 '{"status": 429, "message": "Too Many Requests. Please email us at info@archive.org"}';
+      }
+
       location / {
-        proxy_pass http://covers:7075;
+        proxy_pass http://covers_backend;
         proxy_set_header Host $http_host;
 
         # Gunicorn takes IP from this header
@@ -37,8 +53,17 @@ server {
         proxy_set_header X-Scheme $scheme;
 
         if ($http_user_agent ~ (Bytespider) ) {
-           return 429;
+           return 444;
         }
+
+        if ($http_user_agent ~ (CloudFront) ) {
+           return 444;
+        }
+
+
+        # Covers rate limit.
+        limit_req zone=cover_limit burst=400 nodelay;
+        limit_req_status 429;
       }
 
       location ^~ /.well-known/acme-challenge/ {

diff --git a/docker/nginx.conf b/docker/nginx.conf
@@ -11,7 +11,7 @@ error_log  /var/log/nginx/error.log;
 pid        /var/run/nginx.pid;
 
 events {
-    worker_connections  1024;
+    worker_connections  2048;
     # multi_accept on;
 }
 
@@ -44,6 +44,25 @@ http {
     # Black-listed IPs
     include /olsystem/etc/nginx/deny.conf;
 
+    # Rate limiting: https://nginx.org/en/docs/http/ngx_http_limit_req_module.html
+    # Limiting is skipped when IP obfuscation is not applied, as every IP is then 255.0.0.0.
+    # The zones below only take effect where a `limit_req` directive uses them,
+    # so rate limiting can be disabled in `docker/web_nginx.conf`
+    # and `docker/covers_nginx.conf`.
+    geo $should_apply_limit {
+      255.0.0.0 0;  # Address seen for every client in that case: no limit.
+      default 1;
+    }
+
+    map $should_apply_limit $rate_limit_key {
+      0 '';  # Requests with an empty key are not counted by limit_req.
+      1 $binary_remote_addr;
+    }
+
+    limit_req_zone $rate_limit_key zone=web_limit:10m rate=200r/m;
+    # Set a more permissive limit for covers because some pages might load 20+ covers.
+    limit_req_zone $rate_limit_key zone=cover_limit:10m rate=400r/m;
+
     # Things are mounted into here by the docker compose file
     include /etc/nginx/sites-enabled/*;
 }
diff --git a/docker/web_nginx.conf b/docker/web_nginx.conf
@@ -64,6 +64,14 @@ server {
     if ($api_call = "http:noapi") {
         rewrite ^(.*)$ https://$http_host$1 last;
     }
+
+    # Return 429 (Too Many Requests) errors as JSON via the named location below.
+    error_page 429 = @429;  # "=" uses the status returned by @429.
+    location @429 {
+      default_type application/json;
+      return 429 '{"status": 429, "message": "Too Many Requests. Consider using https://openlibrary.org/developers/dumps."}';
+    }
+
     location / {
         proxy_pass http://webnodes;
         proxy_set_header Host $http_host;
@@ -76,8 +84,13 @@ server {
         proxy_set_header X-Scheme $scheme;
 
         if ($http_user_agent ~ (Bytespider) ) {
-           return 429;
+           return 444;
         }
+
+
+        # Web rate limit.
+        limit_req zone=web_limit burst=200 nodelay;
+        limit_req_status 429;
     }
 
     location ^~ /.well-known/acme-challenge/ {