From ce3cb400cd190be3003b2e5a9dd33ce22a05b18a Mon Sep 17 00:00:00 2001
From: Simon <63975668+Simyon264@users.noreply.github.com>
Date: Thu, 26 Sep 2024 20:17:56 +0200
Subject: [PATCH 1/2] Ranged requests

---
 ReplayBrowser/Helpers/HttpExtensions.cs  |  25 +-
 ReplayBrowser/Helpers/ZipDownloader.cs   |  43 ++
 ReplayBrowser/ReplayBrowser.csproj       |   8 +
 .../ReplayParser/ReplayParserService.cs  |  52 ++-
 Tools/unzip_http.py                      | 382 ++++++++++++++++++
 5 files changed, 503 insertions(+), 7 deletions(-)
 create mode 100644 ReplayBrowser/Helpers/ZipDownloader.cs
 create mode 100644 Tools/unzip_http.py

diff --git a/ReplayBrowser/Helpers/HttpExtensions.cs b/ReplayBrowser/Helpers/HttpExtensions.cs
index 62f0fb7..a7c75d3 100644
--- a/ReplayBrowser/Helpers/HttpExtensions.cs
+++ b/ReplayBrowser/Helpers/HttpExtensions.cs
@@ -1,7 +1,30 @@
-namespace ReplayBrowser.Helpers;
+using System.IO.Compression;
+
+namespace ReplayBrowser.Helpers;
 
 public static class HttpExtensions
 {
+    /// <summary>
+    /// Checks if the server supports range requests (Accept-Ranges: bytes).
+    /// </summary>
+    public static async Task<bool> SupportsRangeRequests(this HttpClient client, string requestUri)
+    {
+        var request = new HttpRequestMessage(HttpMethod.Head, requestUri);
+        var response = await client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
+        return response.Headers.AcceptRanges.Contains("bytes");
+    }
+
+    /// <summary>
+    /// Returns the size of the file in bytes, or -1 if the server reports no Content-Length.
+    /// </summary>
+    public static async Task<long> GetFileSizeAsync(this HttpClient client, string requestUri)
+    {
+        var request = new HttpRequestMessage(HttpMethod.Head, requestUri);
+        var response = await client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
+        var contentLength = response.Content.Headers.ContentLength;
+        return contentLength ?? -1;
+    }
+
     public static async Task<Stream> GetStreamAsync(this HttpClient client, string requestUri, IProgress<double> progress, CancellationToken token)
     {
         var response = await client.GetAsync(requestUri, HttpCompletionOption.ResponseHeadersRead, token);
diff --git a/ReplayBrowser/Helpers/ZipDownloader.cs b/ReplayBrowser/Helpers/ZipDownloader.cs
new file mode 100644
index 0000000..1ed5e67
--- /dev/null
+++ b/ReplayBrowser/Helpers/ZipDownloader.cs
@@ -0,0 +1,43 @@
+
+using System.Diagnostics;
+using System.Text;
+
+namespace ReplayBrowser.Helpers;
+
+public static class ZipDownloader
+{
+    public static async Task<Dictionary<string, Stream>> ExtractFilesFromZipAsync(string zipUrl, string[] filesToExtract)
+    {
+        // A C# implementation was attempted first, but the Python script "unzip_http" already does this, so we shell out to it instead.
+
+        var files = new Dictionary<string, Stream>();
+
+        foreach (var file in filesToExtract)
+        {
+            var process = new Process();
+            process.StartInfo.FileName = "python";
+            process.StartInfo.UseShellExecute = false;
+            process.StartInfo.RedirectStandardOutput = true;
+            process.StartInfo.RedirectStandardError = true;
+            var path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Tools/unzip_http.py");
+            process.StartInfo.Arguments = $"{path} -o {zipUrl} {file}";
+
+            process.Start();
+
+            var output = await process.StandardOutput.ReadToEndAsync();
+            var error = await process.StandardError.ReadToEndAsync();
+
+            await process.WaitForExitAsync();
+
+            if (process.ExitCode != 0)
+            {
+                throw new Exception($"Failed to extract files from zip: {error}");
+            }
+
+            var stream = new MemoryStream(Encoding.UTF8.GetBytes(output));
+            files.Add(file, stream);
+        }
+
+        return files;
+    }
+}
\ No newline at end of file
diff --git a/ReplayBrowser/ReplayBrowser.csproj b/ReplayBrowser/ReplayBrowser.csproj
index 964f030..fbe5de1 100644
---
 a/ReplayBrowser/ReplayBrowser.csproj
+++ b/ReplayBrowser/ReplayBrowser.csproj
@@ -6,6 +6,8 @@
     <Nullable>enable</Nullable>
     <Configurations>Debug;Release;Testing</Configurations>
     <Platforms>AnyCPU</Platforms>
+
+    <SolutionDir Condition="'$(SolutionDir)' == ''">$(MSBuildThisFileDirectory)..\</SolutionDir>
   </PropertyGroup>
 
   <ItemGroup>
@@ -33,8 +35,14 @@
   </ItemGroup>
 
+  <ItemGroup>
+    <None Include="$(SolutionDir)Tools\unzip_http.py">
+      <Link>Tools\unzip_http.py</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+
   <ItemGroup>
     <_ContentIncludedByDefault Remove="wwwroot\lib\bootstrap\dist\css\bootstrap-grid.css" />
     <_ContentIncludedByDefault Remove="wwwroot\lib\bootstrap\dist\css\bootstrap-grid.css.map" />
diff --git a/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs b/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
index fa660d0..0b15f41 100644
--- a/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
+++ b/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
@@ -3,6 +3,7 @@
 using System.IO.Compression;
 using System.Text;
 using Microsoft.EntityFrameworkCore;
+using Microsoft.Net.Http.Headers;
 using ReplayBrowser.Data;
 using ReplayBrowser.Data.Models;
 using ReplayBrowser.Helpers;
@@ -34,6 +35,11 @@ public class ReplayParserService : IHostedService, IDisposable
 
     private readonly IServiceScopeFactory _factory;
 
+    /// <summary>
+    /// Replays failing with this error won't be added to the parsed replays, so they get redownloaded every time.
+    /// </summary>
+    private const string YamlSerializerError = "Exception during deserialization";
+
     public ReplayParserService(IConfiguration configuration, IServiceScopeFactory factory)
     {
         _configuration = configuration;
@@ -163,18 +169,47 @@ private async Task ConsumeQueue(CancellationToken token)
                     });
                     client.DefaultRequestHeaders.Add("User-Agent", "ReplayBrowser");
                     Log.Information("Downloading " + replay);
-                    var stream = await client.GetStreamAsync(replay, progress, token);
-                    completed++;
-                    Details = $"{completed}/{total}";
+                    var fileSize = await client.GetFileSizeAsync(replay);
+                    // Check if the server supports range requests.
+                    var supportsRange = (await client.SupportsRangeRequests(replay) && fileSize != -1);
+
                     Replay? parsedReplay = null;
                     try
                     {
-                        parsedReplay = ParseReplay(stream, replay);
+                        if (!supportsRange)
+                        {
+                            var stream = await client.GetStreamAsync(replay, progress, token);
+                            completed++;
+                            Details = $"{completed}/{total}";
+                            parsedReplay = ParseReplay(stream, replay);
+                        }
+                        else
+                        {
+                            try
+                            {
+                                // The server supports ranged processing!
+                                string[] files = ["_replay/replay_final.yml"];
+                                var extractedFiles = await ZipDownloader.ExtractFilesFromZipAsync(replay, files);
+                                completed++;
+                                Details = $"{completed}/{total}";
+                                parsedReplay = FinalizeReplayParse(new StreamReader(extractedFiles["_replay/replay_final.yml"]), replay);
+                            }
+                            catch (Exception e)
+                            {
+                                Log.Error(e, "Error while downloading " + replay);
+                                if (e.Message.Contains(YamlSerializerError)) return;
+
+                                await AddParsedReplayToDb(replay);
+                                return;
+                            }
+                        }
                     }
                     catch (Exception e)
                     {
                         Log.Error(e, "Error while parsing " + replay);
-                        await AddParsedReplayToDb(replay); // Prevent circular download eating up all resources.
+                        if (e.Message.Contains(YamlSerializerError)) return;
+
+                        await AddParsedReplayToDb(replay);
                         return;
                     }
                     // See if the link matches the date regex, if it does set the date
@@ -259,12 +294,17 @@ private Replay ParseReplay(Stream stream, string replayLink)
         var replayStream = replayFile.Open();
         var reader = new StreamReader(replayStream);
 
+        return FinalizeReplayParse(reader, replayLink);
+    }
+
+    private Replay FinalizeReplayParse(StreamReader stream, string replayLink)
+    {
         var deserializer = new DeserializerBuilder()
             .IgnoreUnmatchedProperties()
             .WithNamingConvention(CamelCaseNamingConvention.Instance)
             .Build();
 
-        var yamlReplay = deserializer.Deserialize<YamlReplay>(reader);
+        var yamlReplay = deserializer.Deserialize<YamlReplay>(stream);
         if (yamlReplay.Map == null && yamlReplay.Maps == null)
         {
             throw new Exception("Replay is not valid.");
diff --git a/Tools/unzip_http.py b/Tools/unzip_http.py
new file mode 100644
index 0000000..4817a09
--- /dev/null
+++ b/Tools/unzip_http.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2022 Saul Pwanson (modified by Simyon)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""
+usage: unzip_http [-h] [-l] [-f] [-o] url [files ...]
+
+Extract individual files from .zip files over http without downloading the
+entire archive. HTTP server must send `Accept-Ranges: bytes` and
+`Content-Length` in headers.
+
+positional arguments:
+  url                   URL of the remote zip file
+  files                 Files to extract. If no filenames given, displays .zip
+                        contents (filenames and sizes). Each filename can be a
+                        wildcard glob.
+
+options:
+  -h, --help            show this help message and exit
+  -l, --list            List files in the remote zip file
+  -f, --full-filepaths  Recreate folder structure from zip file when extracting
+                        (instead of extracting the files to the current
+                        directory)
+  -o, --stdout          Write files to stdout (if multiple files: concatenate
+                        them to stdout, in zipfile order)
+"""
+
+import sys
+import os
+import io
+import math
+import time
+import zlib
+import struct
+import fnmatch
+import argparse
+import pathlib
+import urllib.parse
+
+import urllib3
+
+
+__version__ = '0.6'
+
+
+def error(s):
+    raise Exception(s)
+
+def warning(s):
+    print(s, file=sys.stderr)
+
+def get_bits(val:int, *args):
+    'Generate bitfields (one for each arg) from LSB to MSB.'
+    for n in args:
+        x = val & (2**n-1)
+        val >>= n
+        yield x
+
+
+class RemoteZipInfo:
+    def __init__(self, filename:str='',
+                       date_time:int = 0,
+                       header_offset:int = 0,
+                       compress_type:int = 0,
+                       compress_size:int = 0,
+                       file_size:int = 0):
+        self.filename = filename
+        self.header_offset = header_offset
+        self.compress_type = compress_type
+        self.compress_size = compress_size
+        self.file_size = file_size
+
+        sec, mins, hour, day, mon, year = get_bits(date_time, 5, 6, 5, 5, 4, 7)
+        self.date_time = (year+1980, mon, day, hour, mins, sec)
+
+    def is_dir(self):
+        return self.filename.endswith('/')
+
+    def parse_extra(self, extra):
+        i = 0
+        while i < len(extra):
+            fieldid, fieldsz = struct.unpack_from('<HH', extra, i)
+            i += 4
+
+            if fieldid == 1:  # zip64
+                if fieldsz == 8: fmt = '<Q'
+                elif fieldsz == 16: fmt = '<QQ'
+                elif fieldsz == 24: fmt = '<QQQ'
+                elif fieldsz == 28: fmt = '<QQQI'
+
+                vals = list(struct.unpack_from(fmt, extra, i))
+                if self.file_size == 0xffffffff:
+                    self.file_size = vals.pop(0)
+
+                if self.compress_size == 0xffffffff:
+                    self.compress_size = vals.pop(0)
+
+                if self.header_offset == 0xffffffff:
+                    self.header_offset = vals.pop(0)
+
+            i += fieldsz
+
+
+class RemoteZipFile:
+    fmt_eocd = '<IHHHHIIH'  # end of central directory
+    fmt_eocd64 = '<IQHHIIQQQQ'  # end of central directory (zip64)
+    fmt_cdirentry = '<IHHHHIIIIHHHHHII'  # central directory entry
+    fmt_localhdr = '<IHHHIIIIHH'  # local directory header
+    magic_eocd64 = b'\x50\x4b\x06\x06'
+    magic_eocd = b'\x50\x4b\x05\x06'
+
+    def __init__(self, url):
+        self.url = url
+        self.http = urllib3.PoolManager()
+        self.zip_size = 0
+
+    @property
+    def files(self):
+        return {r.filename: r for r in self.infolist()}
+
+    def namelist(self):
+        return list(r.filename for r in self.infolist())
+
+    def infolist(self):
+        resp = self.http.request('HEAD', self.url)
+        r = resp.headers.get('Accept-Ranges', '')
+        if r != 'bytes':
+            warning(f'{self.url} does not support byte range requests')
+
+        self.zip_size = int(resp.headers['Content-Length'])
+        resp = self.get_range(max(self.zip_size - 65536, 0), 65536)
+
+        cdir_start = -1
+        i = resp.data.rfind(self.magic_eocd64)
+        if i >= 0:
+            magic, eocd_sz, create_ver, min_ver, disk_num, disk_start, disk_num_records, total_num_records, \
+                cdir_bytes, cdir_start = struct.unpack_from(self.fmt_eocd64, resp.data, offset=i)
+        else:
+            i = resp.data.rfind(self.magic_eocd)
+            if i >= 0:
+                magic, \
+                    disk_num, disk_start, disk_num_records, total_num_records, \
+                    cdir_bytes, cdir_start, comment_len = struct.unpack_from(self.fmt_eocd, resp.data, offset=i)
+
+        if cdir_start < 0 or cdir_start >= self.zip_size:
+            error('cannot find central directory')
+
+        if self.zip_size <= 65536:
+            filehdr_index = cdir_start
+        else:
+            filehdr_index = 65536 - (self.zip_size - cdir_start)
+
+        if filehdr_index < 0:
+            resp = self.get_range(cdir_start, self.zip_size - cdir_start)
+            filehdr_index = 0
+
+        cdir_end = filehdr_index + cdir_bytes
+        while filehdr_index < cdir_end:
+            sizeof_cdirentry = struct.calcsize(self.fmt_cdirentry)
+
+            magic, ver, ver_needed, flags, method, date_time, crc, \
+                complen, uncomplen, fnlen, extralen, commentlen, \
+                disknum_start, internal_attr, external_attr, local_header_ofs = \
+                    struct.unpack_from(self.fmt_cdirentry, resp.data, offset=filehdr_index)
+
+            filehdr_index += sizeof_cdirentry
+
+            filename = resp.data[filehdr_index:filehdr_index+fnlen]
+            filehdr_index += fnlen
+
+            extra = resp.data[filehdr_index:filehdr_index+extralen]
+            filehdr_index += extralen
+
+            comment = resp.data[filehdr_index:filehdr_index+commentlen]
+            filehdr_index += commentlen
+
+            rzi = RemoteZipInfo(filename.decode(), date_time, local_header_ofs, method, complen, uncomplen)
+
+            rzi.parse_extra(extra)
+            yield rzi
+
+    def extract(self, member, path=None, pwd=None):
+        if pwd:
+            raise NotImplementedError('Passwords not supported yet')
+
+        path = path or pathlib.Path('.')
+
+        outpath = path/member
+        os.makedirs(outpath.parent, exist_ok=True)
+        with self.open(member) as fpin:
+            with open(path/member, mode='wb') as fpout:
+                while True:
+                    r = fpin.read(65536)
+                    if not r:
+                        break
+                    fpout.write(r)
+
+    def extractall(self, path=None, members=None, pwd=None):
+        for fn in members or self.namelist():
+            self.extract(fn, path, pwd=pwd)
+
+    def get_range(self, start, n):
+        return self.http.request('GET', self.url, headers={'Range': f'bytes={start}-{start+n-1}'}, preload_content=False)
+
+    def matching_files(self, *globs):
+        for f in self.files.values():
+            if any(fnmatch.fnmatch(f.filename, g) for g in globs):
+                yield f
+
+    def open(self, fn):
+        if isinstance(fn, str):
+            f = list(self.matching_files(fn))
+            if not f:
+                error(f'no files matching {fn}')
+            f = f[0]
+        else:
+            f = fn
+
+        sizeof_localhdr = struct.calcsize(self.fmt_localhdr)
+        r = self.get_range(f.header_offset, sizeof_localhdr)
+        localhdr = struct.unpack_from(self.fmt_localhdr, r.data)
+        magic, ver, flags, method, dos_datetime, _, _, uncomplen, fnlen, extralen = localhdr
+        if method == 0:  # none
+            return self.get_range(f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size)
+        elif method == 8:  # DEFLATE
+            resp = self.get_range(f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size)
+            return io.BufferedReader(RemoteZipStream(resp, f))
+        else:
+            error(f'unknown compression method {method}')
+
+    def open_text(self, fn):
+        return io.TextIOWrapper(self.open(fn))
+
+
+class RemoteZipStream(io.RawIOBase):
+    def __init__(self, fp, info):
+        super().__init__()
+        self.raw = fp
+        self._decompressor = zlib.decompressobj(-15)
+        self._buffer = bytes()
+
+    def readable(self):
+        return True
+
+    def readinto(self, b):
+        r = self.read(len(b))
+        b[:len(r)] = r
+        return len(r)
+
+    def read(self, n):
+        while n > len(self._buffer):
+            r = self.raw.read(2**18)
+            if not r:
+                self._buffer += self._decompressor.flush()
+                break
+            self._buffer += self._decompressor.decompress(r)
+
+        ret = self._buffer[:n]
+        self._buffer = self._buffer[n:]
+
+        return ret
+
+
+### script start
+
+class StreamProgress:
+    def __init__(self, fp, name='', total=0):
+        self.name = name
+        self.fp = fp
+        self.total = total
+        self.start_time = time.time()
+        self.last_update = 0
+        self.amtread = 0
+
+    def read(self, n):
+        r = self.fp.read(n)
+        self.amtread += len(r)
+        now = time.time()
+        if now - self.last_update > 0.1:
+            self.last_update = now
+
+            elapsed_s = now - self.start_time
+
+        if not r:
+            sys.stderr.write('\n')
+
+        return r
+
+
+def list_files(rzf):
+    def safelog(x):
+        return 1 if x == 0 else math.ceil(math.log10(x))
+
+    digits_compr = max(safelog(f.compress_size) for f in rzf.infolist())
+    digits_plain = max(safelog(f.file_size) for f in rzf.infolist())
+    fmtstr = f'%{digits_compr}d -> %{digits_plain}d\t%s'
+    for f in rzf.infolist():
+        print(fmtstr % (f.compress_size, f.file_size, f.filename), file=sys.stderr)
+
+
+def extract_one(outfile, rzf, f, ofname):
+    fp = StreamProgress(rzf.open(f), name=f.filename, total=f.compress_size)
+    while r := fp.read(2**18):
+        outfile.write(r)
+
+
+def download_file(f, rzf, args):
+    if not any(fnmatch.fnmatch(f.filename, g) for g in args.files):
+        return
+
+    if args.stdout:
+        extract_one(sys.stdout.buffer, rzf, f, "stdout")
+    else:
+        path = pathlib.Path(f.filename)
+        if args.full_filepaths:
+            path.parent.mkdir(parents=True, exist_ok=True)
+        else:
+            path = path.name
+
+        with open(str(path), 'wb') as of:
+            extract_one(of, rzf, f, str(path))
+
+
+def main():
+    parser = argparse.ArgumentParser(prog='unzip-http', \
+        description="Extract individual files from .zip files over http without downloading the entire archive. HTTP server must send `Accept-Ranges: bytes` and `Content-Length` in headers.")
+
+    parser.add_argument('-l', '--list', action='store_true', default=False,
+                        help="List files in the remote zip file")
+    parser.add_argument('-f', '--full-filepaths', action='store_true', default=False,
+                        help="Recreate folder structure from zip file when extracting (instead of extracting the files to the current directory)")
+    parser.add_argument('-o', '--stdout', action='store_true', default=False,
+                        help="Write files to stdout (if multiple files: concatenate them to stdout, in zipfile order)")
+
+    parser.add_argument("url", nargs=1, help="URL of the remote zip file")
+    parser.add_argument("files", nargs='*', help="Files to extract. If no filenames given, displays .zip contents (filenames and sizes). Each filename can be a wildcard glob.")
+
+    args = parser.parse_args()
+
+    rzf = RemoteZipFile(args.url[0])
+    if args.list or len(args.files) == 0:
+        list_files(rzf)
+    else:
+        for f in rzf.infolist():
+            download_file(f, rzf, args)
+
+
+if __name__ == '__main__':
+    main()
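
Note on what the ranged path actually does: unzip_http never downloads the whole
archive. It sends a HEAD request to read Accept-Ranges and Content-Length,
fetches the last 64 KiB of the file to locate the zip's end-of-central-directory
record, and then fetches each requested file's bytes with a Range header. A
minimal C# sketch of that kind of ranged GET, using only standard HttpClient
APIs (illustrative only; GetTailAsync is a hypothetical helper and not part of
this patch):

    using System;
    using System.Net.Http;
    using System.Net.Http.Headers;
    using System.Threading.Tasks;

    public static class RangedFetch
    {
        // Fetch the last `window` bytes of a remote file; for a zip, this is
        // where the end-of-central-directory record lives. The Range header
        // is inclusive on both ends: "bytes=start-(fileSize-1)".
        public static async Task<byte[]> GetTailAsync(HttpClient client, string url, long fileSize, long window = 65536)
        {
            var request = new HttpRequestMessage(HttpMethod.Get, url);
            var start = Math.Max(0, fileSize - window);
            request.Headers.Range = new RangeHeaderValue(start, fileSize - 1);
            using var response = await client.SendAsync(request);
            response.EnsureSuccessStatusCode(); // expect 206 Partial Content
            return await response.Content.ReadAsByteArrayAsync();
        }
    }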
From 4f47531a6c8334bdb7f2091aad894c2eab9676be Mon Sep 17 00:00:00 2001
From: Simon <63975668+Simyon264@users.noreply.github.com>
Date: Fri, 27 Sep 2024 14:31:51 +0200
Subject: [PATCH 2/2] Use fallback when zip extraction fails

---
 .../ReplayParser/ReplayParserService.cs | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs b/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
index 0b15f41..71b9a08 100644
--- a/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
+++ b/ReplayBrowser/Services/ReplayParser/ReplayParserService.cs
@@ -176,14 +176,7 @@ private async Task ConsumeQueue(CancellationToken token)
                     Replay? parsedReplay = null;
                     try
                     {
-                        if (!supportsRange)
-                        {
-                            var stream = await client.GetStreamAsync(replay, progress, token);
-                            completed++;
-                            Details = $"{completed}/{total}";
-                            parsedReplay = ParseReplay(stream, replay);
-                        }
-                        else
+                        if (supportsRange)
                         {
                             try
                             {
@@ -197,12 +190,18 @@ private async Task ConsumeQueue(CancellationToken token)
                             catch (Exception e)
                             {
                                 Log.Error(e, "Error while downloading " + replay);
-                                if (e.Message.Contains(YamlSerializerError)) return;
-
-                                await AddParsedReplayToDb(replay);
-                                return;
+                                // Fall back to the normal full-download method.
+                                supportsRange = false;
                             }
                         }
+
+                        if (!supportsRange)
+                        {
+                            var stream = await client.GetStreamAsync(replay, progress, token);
+                            completed++;
+                            Details = $"{completed}/{total}";
+                            parsedReplay = ParseReplay(stream, replay);
+                        }
                     }
                     catch (Exception e)
                     {
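
Condensed, the control flow in ConsumeQueue after both patches looks like the
following (a sketch of the code above, not a verbatim excerpt; client, replay,
progress, and token are the surrounding locals):

    // Prefer ranged extraction of just _replay/replay_final.yml; on any
    // failure, clear the flag and fall through to the full download.
    var fileSize = await client.GetFileSizeAsync(replay);
    var supportsRange = await client.SupportsRangeRequests(replay) && fileSize != -1;

    Replay? parsedReplay = null;
    if (supportsRange)
    {
        try
        {
            var extracted = await ZipDownloader.ExtractFilesFromZipAsync(
                replay, ["_replay/replay_final.yml"]);
            parsedReplay = FinalizeReplayParse(
                new StreamReader(extracted["_replay/replay_final.yml"]), replay);
        }
        catch (Exception e)
        {
            Log.Error(e, "Error while downloading " + replay);
            supportsRange = false; // fall back to the full download
        }
    }

    if (!supportsRange)
    {
        var stream = await client.GetStreamAsync(replay, progress, token);
        parsedReplay = ParseReplay(stream, replay);
    }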