diff --git a/PaheScrapper/App.config b/PaheScrapper/App.config index dea4e55..07381d8 100644 --- a/PaheScrapper/App.config +++ b/PaheScrapper/App.config @@ -1,12 +1,12 @@ - + - +
- + @@ -26,10 +26,10 @@ 8 - 50 + 5 - 10 + 1 True @@ -51,4 +51,12 @@ + + + + + + + +
diff --git a/PaheScrapper/Helpers/ConsoleHelper.cs b/PaheScrapper/Helpers/ConsoleHelper.cs
index 5c62ad6..c0921ba 100644
--- a/PaheScrapper/Helpers/ConsoleHelper.cs
+++ b/PaheScrapper/Helpers/ConsoleHelper.cs
@@ -28,6 +28,22 @@ public static void LogBranch(string text)
             LogWriter lw = new LogWriter("Branch: " + text);
         }
 
+        /// <summary>
+        /// Prints the duration of one step and the accumulated total to the console, then passes the same text (prefixed with "Time: ") to a new LogWriter.
+        /// </summary>
+        /// <param name="timeSpan">Duration of the step; shown as whole milliseconds.</param>
+        /// <param name="totalTimeSpan">Accumulated duration; shown with the "G" format specifier.</param>
+        public static void LogTime(TimeSpan timeSpan, TimeSpan totalTimeSpan)
+        {
+            // Format once so the console line and the log entry cannot drift apart.
+            string message = $"{(int)timeSpan.TotalMilliseconds} ms - {totalTimeSpan:G}";
+
+            Console.BackgroundColor = ConsoleColor.DarkMagenta;
+            Console.ForegroundColor = ConsoleColor.Yellow;
+            Console.WriteLine(message);
+            LogWriter lw = new LogWriter("Time: " + message);
+        }
+
         public static void LogStats(string text)
         {
             Console.BackgroundColor = ConsoleColor.DarkBlue;
diff --git a/PaheScrapper/Helpers/StringCompressor.cs b/PaheScrapper/Helpers/StringCompressor.cs
new file mode 100644
index 0000000..d1e6ebf
--- /dev/null
+++ b/PaheScrapper/Helpers/StringCompressor.cs
@@ -0,0 +1,71 @@
+using System;
+using System.IO;
+using System.IO.Compression;
+using System.Text;
+
+namespace PaheScrapper.Helpers
+{
+    /// <summary>
+    /// Converts strings to and from a Base64-encoded, GZip-compressed form.
+    /// Before Base64 encoding the payload is a 4-byte length of the UTF-8 text
+    /// (as produced by BitConverter.GetBytes) followed by the GZip stream.
+    /// </summary>
+    internal static class StringCompressor
+    {
+        /// <summary>
+        /// Compresses the string.
+        /// </summary>
+        /// <param name="text">The text.</param>
+        /// <returns>Base64 text holding the length prefix and the GZip-compressed UTF-8 bytes.</returns>
+        public static string CompressString(string text)
+        {
+            byte[] buffer = Encoding.UTF8.GetBytes(text);
+
+            using (var memoryStream = new MemoryStream())
+            {
+                // Length prefix first, so DecompressString can size its output buffer.
+                memoryStream.Write(BitConverter.GetBytes(buffer.Length), 0, 4);
+
+                // leaveOpen keeps memoryStream alive; disposing the GZipStream flushes its final block.
+                using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Compress, true))
+                {
+                    gZipStream.Write(buffer, 0, buffer.Length);
+                }
+
+                return Convert.ToBase64String(memoryStream.ToArray());
+            }
+        }
+
+        /// <summary>
+        /// Decompresses the string.
+        /// </summary>
+        /// <param name="compressedText">The compressed text.</param>
+        /// <returns>The original text.</returns>
+        /// <exception cref="InvalidDataException">The GZip data ends before the declared length is reached.</exception>
+        public static string DecompressString(string compressedText)
+        {
+            byte[] gZipBuffer = Convert.FromBase64String(compressedText);
+            int dataLength = BitConverter.ToInt32(gZipBuffer, 0);
+            var buffer = new byte[dataLength];
+
+            using (var memoryStream = new MemoryStream(gZipBuffer, 4, gZipBuffer.Length - 4))
+            using (var gZipStream = new GZipStream(memoryStream, CompressionMode.Decompress))
+            {
+                // Stream.Read may return fewer bytes than requested, so keep reading until the buffer is full.
+                int offset = 0;
+                while (offset < buffer.Length)
+                {
+                    int read = gZipStream.Read(buffer, offset, buffer.Length - offset);
+                    if (read == 0)
+                    {
+                        throw new InvalidDataException("Compressed data ended before the declared length was reached.");
+                    }
+
+                    offset += read;
+                }
+            }
+
+            return Encoding.UTF8.GetString(buffer);
+        }
+    }
+}
\ No newline at end of file
diff --git a/PaheScrapper/Helpers/WebErrorReporter.cs b/PaheScrapper/Helpers/WebErrorReporter.cs
new file mode 100644
index 0000000..46d2578
--- /dev/null
+++ b/PaheScrapper/Helpers/WebErrorReporter.cs
@@ -0,0 +1,51 @@
+using System.Net;
+using HtmlAgilityPack;
+using Newtonsoft.Json;
+
+namespace PaheScrapper.Helpers
+{
+    /// <summary>
+    /// Writes diagnostic dumps of failed web requests through LogWriter.
+    /// </summary>
+    public static class WebErrorReporter
+    {
+        /// <summary>
+        /// Logs the JSON serialization of a web response under the "Http Dump" heading.
+        /// </summary>
+        /// <param name="webResponse">The response to dump; may be null.</param>
+        public static void HttpError(WebResponse webResponse)
+        {
+            string dump;
+            try
+            {
+                dump = JsonConvert.SerializeObject(webResponse);
+            }
+            catch (JsonException e)
+            {
+                // Best-effort diagnostics: a response that cannot be serialized must not
+                // replace the error the caller is in the middle of reporting.
+                dump = "<serialization failed: " + e.Message + ">";
+            }
+
+            LogWriter lw = new LogWriter("Http Dump: \n" + dump);
+        }
+
+        /// <summary>
+        /// Logs the compressed inner HTML of a document under the "Html Dump" heading.
+        /// </summary>
+        /// <param name="htmlDocument">The document to dump; may be null when no page was downloaded.</param>
+        public static void HtmlError(HtmlDocument htmlDocument)
+        {
+            // Nothing was downloaded (or the document has no root): log a placeholder instead of
+            // throwing NullReferenceException from inside the caller's error handling.
+            string html = htmlDocument?.DocumentNode?.InnerHtml;
+            if (html == null)
+            {
+                LogWriter empty = new LogWriter("Html Dump: \n<no document>");
+                return;
+            }
+
+            LogWriter lw = new LogWriter("Html Dump: \n" + StringCompressor.CompressString(html));
+        }
+    }
+}
\ No newline at end of file
diff --git a/PaheScrapper/PaheScrapper.csproj b/PaheScrapper/PaheScrapper.csproj
index 1946d5a..d50093e 100644
--- a/PaheScrapper/PaheScrapper.csproj
+++ b/PaheScrapper/PaheScrapper.csproj
@@ -68,9 +68,18 @@ ..\packages\DotNetSeleniumExtras.WaitHelpers.3.11.0\lib\net45\SeleniumExtras.WaitHelpers.dll + + ..\packages\System.Buffers.4.5.1\lib\netstandard1.1\System.Buffers.dll + + + ..\packages\System.Memory.4.5.4\lib\netstandard1.1\System.Memory.dll + + + ..\packages\System.Runtime.CompilerServices.Unsafe.4.5.3\lib\netstandard1.0\System.Runtime.CompilerServices.Unsafe.dll + +
@@ -91,8 +100,10 @@ + +
diff --git a/PaheScrapper/Program.cs b/PaheScrapper/Program.cs
index 8aa7ff7..9651ace 100644
--- a/PaheScrapper/Program.cs
+++ b/PaheScrapper/Program.cs
@@ -84,20 +84,20 @@ static void FullScrape(string command = null)
             if (command == "continue")
             {
                 f = File.OpenText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename));
-                _manager = ScrapperManager.Deserialize(f.ReadToEnd());
+                _manager = ScrapperManager.Deserialize(StringCompressor.DecompressString(f.ReadToEnd()));
                 f.Close();
             }
             else if (command == "resync")
             {
                 f = File.OpenText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename));
-                _manager = ScrapperManager.Deserialize(f.ReadToEnd());
+                _manager = ScrapperManager.Deserialize(StringCompressor.DecompressString(f.ReadToEnd()));
                 _manager.ResetState();
                 f.Close();
             }
             else if (command == "loop")
             {
                 f = File.OpenText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename));
-                _manager = ScrapperManager.Deserialize(f.ReadToEnd());
+                _manager = ScrapperManager.Deserialize(StringCompressor.DecompressString(f.ReadToEnd()));
                 f.Close();
                 isLoop = true;
             }
@@ -108,9 +108,9 @@ static void FullScrape(string command = null)
             else if (command == "state")
             {
                 f = File.OpenText(Path.Combine(_folderPath,
Configuration.Default.ManagerStateFilename)); - _manager = ScrapperManager.Deserialize(f.ReadToEnd()); + _manager = ScrapperManager.Deserialize(StringCompressor.DecompressString(f.ReadToEnd())); f.Close(); - ConsoleHelper.LogCommandHandled($"Current State = {_manager.State.ToString()}"); + ConsoleHelper.LogCommandHandled($"Current State = {_manager.State}"); _manager = null; command = null; goto recommand; @@ -164,7 +164,7 @@ static void FullScrape(string command = null) using (var file = File.CreateText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename))) { - file.WriteAsync(_manager.Serialize()).Wait(); + file.WriteAsync(StringCompressor.CompressString(_manager.Serialize())).Wait(); file.Close(); } @@ -209,7 +209,7 @@ static void FullScrape(string command = null) using (var file = File.CreateText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename))) { - file.WriteAsync(_manager.Serialize()).Wait(); + file.WriteAsync(StringCompressor.CompressString(_manager.Serialize())).Wait(); file.Close(); } @@ -232,7 +232,7 @@ static void FullScrape(string command = null) ConsoleHelper.LogStorage("Final Save Scraper State"); using (var file = File.CreateText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename))) { - file.WriteAsync(_manager.Serialize()).Wait(); + file.WriteAsync(StringCompressor.CompressString(_manager.Serialize())).Wait(); file.Close(); } diff --git a/PaheScrapper/Properties/Configuration.Designer.cs b/PaheScrapper/Properties/Configuration.Designer.cs index e39bc42..1bb1de1 100644 --- a/PaheScrapper/Properties/Configuration.Designer.cs +++ b/PaheScrapper/Properties/Configuration.Designer.cs @@ -85,7 +85,7 @@ public int WebDriveInstances { [global::System.Configuration.UserScopedSettingAttribute()] [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] - [global::System.Configuration.DefaultSettingValueAttribute("50")] + [global::System.Configuration.DefaultSettingValueAttribute("5")] public int 
HTMLSaveStateThershold { get { return ((int)(this["HTMLSaveStateThershold"])); @@ -97,7 +97,7 @@ public int HTMLSaveStateThershold { [global::System.Configuration.UserScopedSettingAttribute()] [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] - [global::System.Configuration.DefaultSettingValueAttribute("10")] + [global::System.Configuration.DefaultSettingValueAttribute("1")] public int WebDriveSaveStateThershold { get { return ((int)(this["WebDriveSaveStateThershold"])); diff --git a/PaheScrapper/Properties/Configuration.settings b/PaheScrapper/Properties/Configuration.settings index 21a0b93..8907769 100644 --- a/PaheScrapper/Properties/Configuration.settings +++ b/PaheScrapper/Properties/Configuration.settings @@ -18,10 +18,10 @@ 8 - 50 + 5 - 10 + 1 True diff --git a/PaheScrapper/ScrapperConstants.cs b/PaheScrapper/ScrapperConstants.cs index d1e749b..7b598c2 100644 --- a/PaheScrapper/ScrapperConstants.cs +++ b/PaheScrapper/ScrapperConstants.cs @@ -3,7 +3,7 @@ public static class ScrapperConstants { public static string WebsiteLanding() => "https://pahe.ph/"; - public static string WebsiteLandingPaging(int page) => page <= 1 ? WebsiteLanding() : $"https://pahe.ph/page/{page}/"; + public static string WebsiteLandingPaging(int page) => page < 1 ? 
WebsiteLanding() : $"https://pahe.ph/page/{page + 1}/"; public static int HttpRequestTimeout() => 15 * 1000; } } \ No newline at end of file diff --git a/PaheScrapper/ScrapperManager.cs b/PaheScrapper/ScrapperManager.cs index dc959cc..dba3a9d 100644 --- a/PaheScrapper/ScrapperManager.cs +++ b/PaheScrapper/ScrapperManager.cs @@ -17,6 +17,7 @@ public class ScrapperManager private int _maxPage; private readonly WebsiteContext _websiteContext; private WebRequestHeader _webRequestHeader; + private TimeSpan _totalTimeSpan; public ScrapperManager() { @@ -24,6 +25,7 @@ public ScrapperManager() _maxPage = _currentPage; _scrapperState = ScrapperState.Initiate; _websiteContext = new WebsiteContext(); + _totalTimeSpan = TimeSpan.Zero; } public WebsiteContext Context => _websiteContext; @@ -71,11 +73,15 @@ void PersistWebDriveState(bool transition) HtmlDocument htmlDocument = null; + var initEntryDateTime = DateTime.MinValue; + if (_scrapperState == ScrapperState.Initiate) { int retryCount = 0; int retryLimit = Configuration.Default.HtmlRetryLimit; + initEntryDateTime = DateTime.Now; + BypassSurcuriRoutine(); retry: @@ -86,7 +92,9 @@ void PersistWebDriveState(bool transition) } catch (Exception e) { - if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response.")) + WebErrorReporter.HtmlError(htmlDocument); + + if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response.") || e.Message.Contains("Sequence contains no matching element")) { ConsoleHelper.LogError(e.Message); @@ -98,7 +106,7 @@ void PersistWebDriveState(bool transition) { retryCount++; - if (e.Message.Contains("Input string was not in a correct format.")) + if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Sequence contains no matching element")) BypassSurcuriRoutine(); goto retry; @@ -155,6 +163,10 @@ void PersistWebDriveState(bool transition) ++_currentPage; + var 
elapsedTime = DateTime.Now.Subtract(initEntryDateTime); + _totalTimeSpan += elapsedTime; + ConsoleHelper.LogTime(elapsedTime, _totalTimeSpan); + if (newMoviesList.Count == 0) goto summeryFinish; } @@ -167,6 +179,7 @@ void PersistWebDriveState(bool transition) { _currentPage = i; ConsoleHelper.LogInfo($"Page: {_currentPage + 1}/{_maxPage}"); + initEntryDateTime = DateTime.Now; retry: try @@ -187,7 +200,9 @@ void PersistWebDriveState(bool transition) } catch (Exception e) { - if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response.")) + WebErrorReporter.HtmlError(htmlDocument); + + if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response.") || e.Message.Contains("Sequence contains no matching element")) { ConsoleHelper.LogError(e.Message); @@ -199,7 +214,7 @@ void PersistWebDriveState(bool transition) { retryCount++; - if (e.Message.Contains("Input string was not in a correct format.")) + if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Sequence contains no matching element")) BypassSurcuriRoutine(); goto retry; @@ -234,6 +249,10 @@ void PersistWebDriveState(bool transition) throw e; } } + + var elapsedTime = DateTime.Now.Subtract(initEntryDateTime); + _totalTimeSpan += elapsedTime; + ConsoleHelper.LogTime(elapsedTime, _totalTimeSpan); } summeryFinish: @@ -253,6 +272,7 @@ void PersistWebDriveState(bool transition) int retryLimit = Configuration.Default.HtmlRetryLimit; _currentPage = i; ConsoleHelper.LogInfo($"Page: {_currentPage + 1}/{_maxPage}"); + initEntryDateTime = DateTime.Now; var movie = _websiteContext.MovieSummeries[i]; @@ -368,7 +388,9 @@ void PersistWebDriveState(bool transition) } catch (Exception e) { - if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response.")) + WebErrorReporter.HtmlError(htmlDocument); + + if (e.Message.Contains("Input string was 
not in a correct format.") || e.Message.Contains("Cannot Get Response.") || e.Message.Contains("Sequence contains no matching element")) { ConsoleHelper.LogError(e.Message); @@ -380,7 +402,7 @@ void PersistWebDriveState(bool transition) { retryCount++; - if (e.Message.Contains("Input string was not in a correct format.")) + if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Sequence contains no matching element")) BypassSurcuriRoutine(); goto retry; @@ -415,6 +437,10 @@ void PersistWebDriveState(bool transition) throw e; } } + + var elapsedTime = DateTime.Now.Subtract(initEntryDateTime); + _totalTimeSpan += elapsedTime; + ConsoleHelper.LogTime(elapsedTime, _totalTimeSpan); } _scrapperState = ScrapperState.Sora; diff --git a/PaheScrapper/ScrapperMethods.cs b/PaheScrapper/ScrapperMethods.cs index 566edc6..ba3d14a 100644 --- a/PaheScrapper/ScrapperMethods.cs +++ b/PaheScrapper/ScrapperMethods.cs @@ -229,7 +229,7 @@ void ProcessDownloadBox(MovieDetails tmp_details, HtmlNode tmp_downloadNode, str string qualityNote = null; foreach (var downloadHtml in downloadHtmls) - { + { var tmp_downloadHtml = downloadHtml.Replace(" ", "").TrimStart().TrimEnd(); MemoryStream ms = new MemoryStream(); @@ -347,7 +347,10 @@ void ProcessDownloadBox(MovieDetails tmp_details, HtmlNode tmp_downloadNode, str if (qualityNote != null) qualityNote = null; - downloadLinkNodes = downloadLinkNodes.Where(l => l.Contains("{{")).ToArray(); + downloadLinkNodes = downloadLinkNodes + .Where(l => l.Contains("{{")) + .Select(l=>l.Substring(0, l.LastIndexOf("}}", StringComparison.Ordinal) + 2)) + .ToArray(); foreach (var downloadLinkNode in downloadLinkNodes) { @@ -355,7 +358,6 @@ void ProcessDownloadBox(MovieDetails tmp_details, HtmlNode tmp_downloadNode, str } episode.DownloadQualities.Add(downloadQuality); - //Console.WriteLine($"Q|={downloadQuality.Quality}\n\tS|={downloadQuality.Size}\n\t\tN|={downloadQuality.Notes}"); } tmp_details.Episodes.Add(episode); diff 
--git a/PaheScrapper/ScrapperWeb.cs b/PaheScrapper/ScrapperWeb.cs index 21a86fe..576511f 100644 --- a/PaheScrapper/ScrapperWeb.cs +++ b/PaheScrapper/ScrapperWeb.cs @@ -39,6 +39,7 @@ public static HtmlDocument GetDownloadHtml(string url, WebRequestHeader header) } catch (WebException e) { + WebErrorReporter.HttpError(e.Response); throw new ScrapperDownloaderException("Cannot Get Response.", e); } @@ -93,8 +94,9 @@ public static HtmlDocument PostDownloadHtml(string url, Dictionary + + + \ No newline at end of file