From ce3cb400cd190be3003b2e5a9dd33ce22a05b18a Mon Sep 17 00:00:00 2001
From: Simon <63975668+Simyon264@users.noreply.github.com>
Date: Thu, 26 Sep 2024 20:17:56 +0200
Subject: [PATCH 1/2] Ranged requests
---
ReplayBrowser/Helpers/HttpExtensions.cs | 25 +-
ReplayBrowser/Helpers/ZipDownloader.cs | 43 ++
ReplayBrowser/ReplayBrowser.csproj | 8 +
.../ReplayParser/ReplayParserService.cs | 52 ++-
Tools/unzip_http.py | 382 ++++++++++++++++++
5 files changed, 503 insertions(+), 7 deletions(-)
create mode 100644 ReplayBrowser/Helpers/ZipDownloader.cs
create mode 100644 Tools/unzip_http.py
diff --git a/ReplayBrowser/Helpers/HttpExtensions.cs b/ReplayBrowser/Helpers/HttpExtensions.cs
index 62f0fb7..a7c75d3 100644
--- a/ReplayBrowser/Helpers/HttpExtensions.cs
+++ b/ReplayBrowser/Helpers/HttpExtensions.cs
@@ -1,7 +1,30 @@
-namespace ReplayBrowser.Helpers;
+using System.IO.Compression;
+
+namespace ReplayBrowser.Helpers;
public static class HttpExtensions
{
+    /// <summary>
+    /// Checks if the server supports range requests (Accept-Ranges: bytes).
+    /// </summary>
+    public static async Task<bool> SupportsRangeRequests(this HttpClient client, string requestUri)
+ {
+ var request = new HttpRequestMessage(HttpMethod.Head, requestUri);
+ var response = await client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
+ return response.Headers.AcceptRanges.Contains("bytes");
+ }
+
+    /// <summary>
+    /// Returns the size of the file in bytes, or -1 if the server does not report a Content-Length.
+    /// </summary>
+    public static async Task<long> GetFileSizeAsync(this HttpClient client, string requestUri)
+ {
+ var request = new HttpRequestMessage(HttpMethod.Head, requestUri);
+ var response = await client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
+ var contentLength = response.Content.Headers.ContentLength;
+ return contentLength ?? -1;
+ }
+
    public static async Task<Stream> GetStreamAsync(this HttpClient client, string requestUri, IProgress<double> progress, CancellationToken token)
{
var response = await client.GetAsync(requestUri, HttpCompletionOption.ResponseHeadersRead, token);
diff --git a/ReplayBrowser/Helpers/ZipDownloader.cs b/ReplayBrowser/Helpers/ZipDownloader.cs
new file mode 100644
index 0000000..1ed5e67
--- /dev/null
+++ b/ReplayBrowser/Helpers/ZipDownloader.cs
@@ -0,0 +1,43 @@
+
+using System.Diagnostics;
+using System.Text;
+
+namespace ReplayBrowser.Helpers;
+
+public static class ZipDownloader
+{
+    public static async Task<Dictionary<string, Stream>> ExtractFilesFromZipAsync(string zipUrl, string[] filesToExtract)
+ {
+ // ok so i first tried doing this in c#, but then saw a python script "unzip_http" that does this, so now im just gonna call that
+
+        var files = new Dictionary<string, Stream>();
+
+ foreach (var file in filesToExtract)
+ {
+ var process = new Process();
+ process.StartInfo.FileName = "python";
+ process.StartInfo.UseShellExecute = false;
+ process.StartInfo.RedirectStandardOutput = true;
+ process.StartInfo.RedirectStandardError = true;
+ var path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory + "Tools/unzip_http.py");
+ process.StartInfo.Arguments = $"{path} -o {zipUrl} {file}";
+
+ process.Start();
+
+ var output = await process.StandardOutput.ReadToEndAsync();
+ var error = await process.StandardError.ReadToEndAsync();
+
+ await process.WaitForExitAsync();
+
+ if (process.ExitCode != 0)
+ {
+ throw new Exception($"Failed to extract files from zip: {error}");
+ }
+
+ var stream = new MemoryStream(Encoding.UTF8.GetBytes(output));
+ files.Add(file, stream);
+ }
+
+ return files;
+ }
+}
\ No newline at end of file
diff --git a/ReplayBrowser/ReplayBrowser.csproj b/ReplayBrowser/ReplayBrowser.csproj
index 964f030..fbe5de1 100644
--- a/ReplayBrowser/ReplayBrowser.csproj
+++ b/ReplayBrowser/ReplayBrowser.csproj
@@ -6,6 +6,8 @@
enable
Debug;Release;Testing
AnyCPU
+
+ $(MSBuildThisFileDirectory)..\
@@ -33,8 +35,14 @@
+
+
+
+
+
+
<_ContentIncludedByDefault Remove="wwwroot\lib\bootstrap\dist\css\bootstrap-grid.css" />
<_ContentIncludedByDefault Remove="wwwroot\lib\bootstrap\dist\css\bootstrap-grid.css.map" />
diff --git a/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs b/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
index fa660d0..0b15f41 100644
--- a/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
+++ b/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
@@ -3,6 +3,7 @@
using System.IO.Compression;
using System.Text;
using Microsoft.EntityFrameworkCore;
+using Microsoft.Net.Http.Headers;
using ReplayBrowser.Data;
using ReplayBrowser.Data.Models;
using ReplayBrowser.Helpers;
@@ -34,6 +35,11 @@ public class ReplayParserService : IHostedService, IDisposable
private readonly IServiceScopeFactory _factory;
+    /// <summary>
+    /// Marker text of YamlDotNet deserialization failures. In this case we won't just add it to the parsed replays, so it redownloads it every time.
+    /// </summary>
+ private const string YamlSerializerError = "Exception during deserialization";
+
public ReplayParserService(IConfiguration configuration, IServiceScopeFactory factory)
{
_configuration = configuration;
@@ -163,18 +169,47 @@ private async Task ConsumeQueue(CancellationToken token)
});
client.DefaultRequestHeaders.Add("User-Agent", "ReplayBrowser");
Log.Information("Downloading " + replay);
- var stream = await client.GetStreamAsync(replay, progress, token);
- completed++;
- Details = $"{completed}/{total}";
+ var fileSize = await client.GetFileSizeAsync(replay);
+ // Check if the server supports range requests.
+ var supportsRange = (await client.SupportsRangeRequests(replay) && fileSize != -1);
+
Replay? parsedReplay = null;
try
{
- parsedReplay = ParseReplay(stream, replay);
+ if (!supportsRange)
+ {
+ var stream = await client.GetStreamAsync(replay, progress, token);
+ completed++;
+ Details = $"{completed}/{total}";
+ parsedReplay = ParseReplay(stream, replay);
+ }
+ else
+ {
+ try
+ {
+ // The server supports ranged processing!
+ string[] files = ["_replay/replay_final.yml"];
+ var extractedFiles = await ZipDownloader.ExtractFilesFromZipAsync(replay, files);
+ completed++;
+ Details = $"{completed}/{total}";
+ parsedReplay = FinalizeReplayParse(new StreamReader(extractedFiles["_replay/replay_final.yml"]), replay);
+ }
+ catch (Exception e)
+ {
+ Log.Error(e, "Error while downloading " + replay);
+ if (e.Message.Contains(YamlSerializerError)) return;
+
+ await AddParsedReplayToDb(replay);
+ return;
+ }
+ }
}
catch (Exception e)
{
Log.Error(e, "Error while parsing " + replay);
- await AddParsedReplayToDb(replay); // Prevent circular download eating up all resources.
+ if (e.Message.Contains(YamlSerializerError)) return;
+
+ await AddParsedReplayToDb(replay);
return;
}
// See if the link matches the date regex, if it does set the date
@@ -259,12 +294,17 @@ private Replay ParseReplay(Stream stream, string replayLink)
var replayStream = replayFile.Open();
var reader = new StreamReader(replayStream);
+ return FinalizeReplayParse(reader, replayLink);
+ }
+
+ private Replay FinalizeReplayParse(StreamReader stream, string replayLink)
+ {
var deserializer = new DeserializerBuilder()
.IgnoreUnmatchedProperties()
.WithNamingConvention(CamelCaseNamingConvention.Instance)
.Build();
-    var yamlReplay = deserializer.Deserialize<YamlReplay>(reader);
+    var yamlReplay = deserializer.Deserialize<YamlReplay>(stream);
if (yamlReplay.Map == null && yamlReplay.Maps == null)
{
throw new Exception("Replay is not valid.");
diff --git a/Tools/unzip_http.py b/Tools/unzip_http.py
new file mode 100644
index 0000000..4817a09
--- /dev/null
+++ b/Tools/unzip_http.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2022 Saul Pwanson (modified by Simyon)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""
+usage: unzip_http [-h] [-l] [-f] [-o] url [files ...]
+
+Extract individual files from .zip files over http without downloading the
+entire archive. HTTP server must send `Accept-Ranges: bytes` and
+`Content-Length` in headers.
+
+positional arguments:
+ url URL of the remote zip file
+ files Files to extract. If no filenames given, displays .zip
+ contents (filenames and sizes). Each filename can be a
+ wildcard glob.
+
+options:
+ -h, --help show this help message and exit
+ -l, --list List files in the remote zip file
+ -f, --full-filepaths Recreate folder structure from zip file when extracting
+ (instead of extracting the files to the current
+ directory)
+ -o, --stdout Write files to stdout (if multiple files: concatenate
+ them to stdout, in zipfile order)
+"""
+
+import sys
+import os
+import io
+import math
+import time
+import zlib
+import struct
+import fnmatch
+import argparse
+import pathlib
+import urllib.parse
+
+import urllib3
+
+
+__version__ = '0.6'
+
+
+def error(s):
+ raise Exception(s)
+
+def warning(s):
+ print(s, file=sys.stderr)
+
+def get_bits(val:int, *args):
+ 'Generate bitfields (one for each arg) from LSB to MSB.'
+ for n in args:
+ x = val & (2**n-1)
+ val >>= n
+ yield x
+
+
+class RemoteZipInfo:
+ def __init__(self, filename:str='',
+ date_time:int = 0,
+ header_offset:int = 0,
+ compress_type:int = 0,
+ compress_size:int = 0,
+ file_size:int = 0):
+ self.filename = filename
+ self.header_offset = header_offset
+ self.compress_type = compress_type
+ self.compress_size = compress_size
+ self.file_size = file_size
+
+ sec, mins, hour, day, mon, year = get_bits(date_time, 5, 6, 5, 5, 4, 7)
+ self.date_time = (year+1980, mon, day, hour, mins, sec)
+
+ def is_dir(self):
+ return self.filename.endswith('/')
+
+ def parse_extra(self, extra):
+ i = 0
+ while i < len(extra):
+            fieldid, fieldsz = struct.unpack_from('<HH', extra, i)
+            i += 4
+
+            if fieldid == 0x0001:  # ZIP64 extended information
+                if fieldsz == 8: fmt = '<Q'
+                elif fieldsz == 16: fmt = '<QQ'
+                elif fieldsz == 24: fmt = '<QQQ'
+                else: fmt = '<QQQQ'
+
+                vals = list(struct.unpack_from(fmt, extra, i))
+                if self.file_size == 0xffffffff:
+                    self.file_size = vals.pop(0)
+
+                if self.compress_size == 0xffffffff:
+                    self.compress_size = vals.pop(0)
+
+                if self.header_offset == 0xffffffff:
+                    self.header_offset = vals.pop(0)
+
+            i += fieldsz
+
+
+class RemoteZipFile:
+    fmt_eocd = '<IHHHHIIH'  # end of central directory
+    fmt_eocd64 = '<IQHHIIQQQQ'  # end of central directory (ZIP64)
+    fmt_cdirentry = '<IHHHHIIIIHHHHHII'  # central directory entry
+    fmt_localhdr = '<IHHHIIIIHH'  # local directory header
+
+    magic_eocd64 = b'PK\x06\x06'
+    magic_eocd = b'PK\x05\x06'
+
+    def __init__(self, url):
+        self.url = url
+        self.http = urllib3.PoolManager()
+        self.zip_size = 0
+        self._files = None
+
+    @property
+    def files(self):
+        if self._files is None:
+            self._files = {r.filename: r for r in self.infolist()}
+        return self._files
+
+    def infolist(self):
+        resp = self.http.request('HEAD', self.url)
+        r = resp.headers.get('Accept-Ranges', '')
+        if r != 'bytes':
+            hostname = urllib.parse.urlparse(self.url).netloc
+            warning(f"{hostname} does not support range requests: {r}")
+
+        self.zip_size = int(resp.headers['Content-Length'])
+        resp = self.get_range(max(self.zip_size - 65536, 0), 65536)
+
+        cdir_start = -1
+        i = resp.data.rfind(self.magic_eocd64)
+        if i >= 0:
+ magic, eocd_sz, create_ver, min_ver, disk_num, disk_start, disk_num_records, total_num_records, \
+ cdir_bytes, cdir_start = struct.unpack_from(self.fmt_eocd64, resp.data, offset=i)
+ else:
+ i = resp.data.rfind(self.magic_eocd)
+ if i >= 0:
+ magic, \
+ disk_num, disk_start, disk_num_records, total_num_records, \
+ cdir_bytes, cdir_start, comment_len = struct.unpack_from(self.fmt_eocd, resp.data, offset=i)
+
+ if cdir_start < 0 or cdir_start >= self.zip_size:
+ error('cannot find central directory')
+
+ if self.zip_size <= 65536:
+ filehdr_index = cdir_start
+ else:
+ filehdr_index = 65536 - (self.zip_size - cdir_start)
+
+ if filehdr_index < 0:
+ resp = self.get_range(cdir_start, self.zip_size - cdir_start)
+ filehdr_index = 0
+
+ cdir_end = filehdr_index + cdir_bytes
+ while filehdr_index < cdir_end:
+ sizeof_cdirentry = struct.calcsize(self.fmt_cdirentry)
+
+ magic, ver, ver_needed, flags, method, date_time, crc, \
+ complen, uncomplen, fnlen, extralen, commentlen, \
+ disknum_start, internal_attr, external_attr, local_header_ofs = \
+ struct.unpack_from(self.fmt_cdirentry, resp.data, offset=filehdr_index)
+
+ filehdr_index += sizeof_cdirentry
+
+ filename = resp.data[filehdr_index:filehdr_index+fnlen]
+ filehdr_index += fnlen
+
+ extra = resp.data[filehdr_index:filehdr_index+extralen]
+ filehdr_index += extralen
+
+ comment = resp.data[filehdr_index:filehdr_index+commentlen]
+ filehdr_index += commentlen
+
+ rzi = RemoteZipInfo(filename.decode(), date_time, local_header_ofs, method, complen, uncomplen)
+
+ rzi.parse_extra(extra)
+ yield rzi
+
+ def extract(self, member, path=None, pwd=None):
+ if pwd:
+ raise NotImplementedError('Passwords not supported yet')
+
+ path = path or pathlib.Path('.')
+
+ outpath = path/member
+ os.makedirs(outpath.parent, exist_ok=True)
+ with self.open(member) as fpin:
+ with open(path/member, mode='wb') as fpout:
+ while True:
+ r = fpin.read(65536)
+ if not r:
+ break
+ fpout.write(r)
+
+ def extractall(self, path=None, members=None, pwd=None):
+ for fn in members or self.namelist():
+ self.extract(fn, path, pwd=pwd)
+
+ def get_range(self, start, n):
+ return self.http.request('GET', self.url, headers={'Range': f'bytes={start}-{start+n-1}'}, preload_content=False)
+
+ def matching_files(self, *globs):
+ for f in self.files.values():
+ if any(fnmatch.fnmatch(f.filename, g) for g in globs):
+ yield f
+
+ def open(self, fn):
+ if isinstance(fn, str):
+ f = list(self.matching_files(fn))
+ if not f:
+ error(f'no files matching {fn}')
+ f = f[0]
+ else:
+ f = fn
+
+ sizeof_localhdr = struct.calcsize(self.fmt_localhdr)
+ r = self.get_range(f.header_offset, sizeof_localhdr)
+ localhdr = struct.unpack_from(self.fmt_localhdr, r.data)
+ magic, ver, flags, method, dos_datetime, _, _, uncomplen, fnlen, extralen = localhdr
+ if method == 0: # none
+ return self.get_range(f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size)
+ elif method == 8: # DEFLATE
+ resp = self.get_range(f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size)
+ return io.BufferedReader(RemoteZipStream(resp, f))
+ else:
+ error(f'unknown compression method {method}')
+
+ def open_text(self, fn):
+ return io.TextIOWrapper(self.open(fn))
+
+
+class RemoteZipStream(io.RawIOBase):
+ def __init__(self, fp, info):
+ super().__init__()
+ self.raw = fp
+ self._decompressor = zlib.decompressobj(-15)
+ self._buffer = bytes()
+
+ def readable(self):
+ return True
+
+ def readinto(self, b):
+ r = self.read(len(b))
+ b[:len(r)] = r
+ return len(r)
+
+ def read(self, n):
+ while n > len(self._buffer):
+ r = self.raw.read(2**18)
+ if not r:
+ self._buffer += self._decompressor.flush()
+ break
+ self._buffer += self._decompressor.decompress(r)
+
+ ret = self._buffer[:n]
+ self._buffer = self._buffer[n:]
+
+ return ret
+
+
+ ### script start
+
+class StreamProgress:
+ def __init__(self, fp, name='', total=0):
+ self.name = name
+ self.fp = fp
+ self.total = total
+ self.start_time = time.time()
+ self.last_update = 0
+ self.amtread = 0
+
+ def read(self, n):
+ r = self.fp.read(n)
+ self.amtread += len(r)
+ now = time.time()
+ if now - self.last_update > 0.1:
+ self.last_update = now
+
+ elapsed_s = now - self.start_time
+
+ if not r:
+ sys.stderr.write('\n')
+
+ return r
+
+
+def list_files(rzf):
+ def safelog(x):
+ return 1 if x == 0 else math.ceil(math.log10(x))
+
+ digits_compr = max(safelog(f.compress_size) for f in rzf.infolist())
+ digits_plain = max(safelog(f.file_size ) for f in rzf.infolist())
+ fmtstr = f'%{digits_compr}d -> %{digits_plain}d\t%s'
+ for f in rzf.infolist():
+ print(fmtstr % (f.compress_size, f.file_size, f.filename), file=sys.stderr)
+
+
+def extract_one(outfile, rzf, f, ofname):
+ fp = StreamProgress(rzf.open(f), name=f.filename, total=f.compress_size)
+ while r := fp.read(2**18):
+ outfile.write(r)
+
+
+def download_file(f, rzf, args):
+ if not any(fnmatch.fnmatch(f.filename, g) for g in args.files):
+ return
+
+ if args.stdout:
+ extract_one(sys.stdout.buffer, rzf, f, "stdout")
+ else:
+ path = pathlib.Path(f.filename)
+ if args.full_filepaths:
+ path.parent.mkdir(parents=True, exist_ok=True)
+ else:
+ path = path.name
+
+ with open(str(path), 'wb') as of:
+ extract_one(of, rzf, f, str(path))
+
+
+def main():
+ parser = argparse.ArgumentParser(prog='unzip-http', \
+ description="Extract individual files from .zip files over http without downloading the entire archive. HTTP server must send `Accept-Ranges: bytes` and `Content-Length` in headers.")
+
+ parser.add_argument('-l', '--list', action='store_true', default=False,
+ help="List files in the remote zip file")
+ parser.add_argument('-f', '--full-filepaths', action='store_true', default=False,
+ help="Recreate folder structure from zip file when extracting (instead of extracting the files to the current directory)")
+ parser.add_argument('-o', '--stdout', action='store_true', default=False,
+ help="Write files to stdout (if multiple files: concatenate them to stdout, in zipfile order)")
+
+ parser.add_argument("url", nargs=1, help="URL of the remote zip file")
+ parser.add_argument("files", nargs='*', help="Files to extract. If no filenames given, displays .zip contents (filenames and sizes). Each filename can be a wildcard glob.")
+
+ args = parser.parse_args()
+
+ rzf = RemoteZipFile(args.url[0])
+ if args.list or len(args.files) == 0:
+ list_files(rzf)
+ else:
+ for f in rzf.infolist():
+ download_file(f, rzf, args)
+
+
+
+if __name__ == '__main__':
+ main()
From 4f47531a6c8334bdb7f2091aad894c2eab9676be Mon Sep 17 00:00:00 2001
From: Simon <63975668+Simyon264@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:31:51 +0200
Subject: [PATCH 2/2] Use fallback when zip extraction fails
---
.../ReplayParser/ReplayParserService.cs | 23 +++++++++----------
1 file changed, 11 insertions(+), 12 deletions(-)
diff --git a/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs b/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
index 0b15f41..71b9a08 100644
--- a/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
+++ b/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
@@ -176,14 +176,7 @@ private async Task ConsumeQueue(CancellationToken token)
Replay? parsedReplay = null;
try
{
- if (!supportsRange)
- {
- var stream = await client.GetStreamAsync(replay, progress, token);
- completed++;
- Details = $"{completed}/{total}";
- parsedReplay = ParseReplay(stream, replay);
- }
- else
+ if (supportsRange)
{
try
{
@@ -197,12 +190,18 @@ private async Task ConsumeQueue(CancellationToken token)
catch (Exception e)
{
Log.Error(e, "Error while downloading " + replay);
- if (e.Message.Contains(YamlSerializerError)) return;
-
- await AddParsedReplayToDb(replay);
- return;
+                            // Ranged extraction failed; fall back to downloading the full archive.
+ supportsRange = false;
}
}
+
+ if (!supportsRange)
+ {
+ var stream = await client.GetStreamAsync(replay, progress, token);
+ completed++;
+ Details = $"{completed}/{total}";
+ parsedReplay = ParseReplay(stream, replay);
+ }
}
catch (Exception e)
{