diff --git a/PaheScrapper/App.config b/PaheScrapper/App.config
index dea4e55..07381d8 100644
--- a/PaheScrapper/App.config
+++ b/PaheScrapper/App.config
@@ -1,12 +1,12 @@
-
+
-
+
-
+
@@ -26,10 +26,10 @@
8
- 50
+ 5
- 10
+ 1
True
@@ -51,4 +51,12 @@
+
+
+
+
+
+
+
+
diff --git a/PaheScrapper/Helpers/ConsoleHelper.cs b/PaheScrapper/Helpers/ConsoleHelper.cs
index 5c62ad6..c0921ba 100644
--- a/PaheScrapper/Helpers/ConsoleHelper.cs
+++ b/PaheScrapper/Helpers/ConsoleHelper.cs
@@ -28,6 +28,14 @@ public static void LogBranch(string text)
LogWriter lw = new LogWriter("Branch: " + text);
}
+        /// <summary>
+        /// Prints a timing line (yellow text on a dark magenta background) to the console
+        /// and hands the same line, prefixed with "Time: ", to a new <see cref="LogWriter"/>.
+        /// </summary>
+        /// <param name="timeSpan">Elapsed time to report; shown truncated to whole milliseconds.</param>
+        /// <param name="totalTimeSpan">Running total to report; rendered with the "G" format specifier.</param>
+        public static void LogTime(TimeSpan timeSpan, TimeSpan totalTimeSpan)
+        {
+            // Build the line once so the console and the log receive exactly the same text.
+            var wholeMilliseconds = (int)timeSpan.TotalMilliseconds;
+            string report = $"{wholeMilliseconds} ms - {totalTimeSpan:G}";
+
+            Console.ForegroundColor = ConsoleColor.Yellow;
+            Console.BackgroundColor = ConsoleColor.DarkMagenta;
+            Console.WriteLine(report);
+
+            // NOTE(review): LogWriter appears to do its work in the constructor — confirm.
+            LogWriter timeLog = new LogWriter("Time: " + report);
+        }
+
public static void LogStats(string text)
{
Console.BackgroundColor = ConsoleColor.DarkBlue;
diff --git a/PaheScrapper/Helpers/StringCompressor.cs b/PaheScrapper/Helpers/StringCompressor.cs
new file mode 100644
index 0000000..d1e6ebf
--- /dev/null
+++ b/PaheScrapper/Helpers/StringCompressor.cs
@@ -0,0 +1,60 @@
+using System;
+using System.IO;
+using System.IO.Compression;
+using System.Text;
+
+namespace PaheScrapper.Helpers
+{
+    /// <summary>
+    /// GZip-compresses strings into a Base64 envelope and restores them again.
+    /// </summary>
+    /// <remarks>
+    /// Envelope layout before Base64 encoding: a 4-byte length of the original UTF-8 payload
+    /// (as produced by <see cref="BitConverter.GetBytes(int)"/>, i.e. machine byte order),
+    /// followed by the GZip stream. The layout is unchanged from the first version of this
+    /// class, so previously written data stays readable.
+    /// </remarks>
+    internal static class StringCompressor
+    {
+        /// <summary>Size in bytes of the length prefix that precedes the GZip data.</summary>
+        private const int LengthPrefixSize = sizeof(int);
+
+        /// <summary>
+        /// Compresses the string.
+        /// </summary>
+        /// <param name="text">The text to compress; encoded as UTF-8 before compression.</param>
+        /// <returns>Base64 text containing the length prefix followed by the GZip data.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
+        public static string CompressString(string text)
+        {
+            if (text == null)
+            {
+                throw new ArgumentNullException(nameof(text));
+            }
+
+            byte[] payload = Encoding.UTF8.GetBytes(text);
+
+            using (var envelope = new MemoryStream())
+            {
+                // Write the prefix first so the whole envelope is assembled in a single buffer.
+                envelope.Write(BitConverter.GetBytes(payload.Length), 0, LengthPrefixSize);
+
+                // leaveOpen: true - the GZip footer is only flushed on dispose, and the
+                // underlying stream must still be readable afterwards.
+                using (var gzip = new GZipStream(envelope, CompressionMode.Compress, true))
+                {
+                    gzip.Write(payload, 0, payload.Length);
+                }
+
+                return Convert.ToBase64String(envelope.ToArray());
+            }
+        }
+
+        /// <summary>
+        /// Decompresses the string.
+        /// </summary>
+        /// <param name="compressedText">Base64 text produced by <see cref="CompressString"/>.</param>
+        /// <returns>The original, uncompressed text.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="compressedText"/> is null.</exception>
+        /// <exception cref="FormatException"><paramref name="compressedText"/> is not valid Base64.</exception>
+        /// <exception cref="InvalidDataException">
+        /// The envelope is shorter than its length prefix, declares a negative length, or the
+        /// GZip data ends before the declared number of bytes has been produced.
+        /// </exception>
+        public static string DecompressString(string compressedText)
+        {
+            if (compressedText == null)
+            {
+                throw new ArgumentNullException(nameof(compressedText));
+            }
+
+            byte[] envelope = Convert.FromBase64String(compressedText);
+            if (envelope.Length < LengthPrefixSize)
+            {
+                throw new InvalidDataException("Compressed text is too short to contain a length prefix.");
+            }
+
+            int payloadLength = BitConverter.ToInt32(envelope, 0);
+            if (payloadLength < 0)
+            {
+                throw new InvalidDataException("Compressed text declares a negative payload length.");
+            }
+
+            var payload = new byte[payloadLength];
+
+            using (var compressed = new MemoryStream(envelope, LengthPrefixSize, envelope.Length - LengthPrefixSize, false))
+            using (var gzip = new GZipStream(compressed, CompressionMode.Decompress))
+            {
+                // Stream.Read may return fewer bytes than requested, so keep reading until the
+                // buffer is full; a single call could leave the tail of the payload zeroed.
+                int filled = 0;
+                while (filled < payload.Length)
+                {
+                    int read = gzip.Read(payload, filled, payload.Length - filled);
+                    if (read == 0)
+                    {
+                        throw new InvalidDataException("Compressed data ended before the declared payload length was reached.");
+                    }
+
+                    filled += read;
+                }
+            }
+
+            return Encoding.UTF8.GetString(payload);
+        }
+    }
+}
\ No newline at end of file
diff --git a/PaheScrapper/Helpers/WebErrorReporter.cs b/PaheScrapper/Helpers/WebErrorReporter.cs
new file mode 100644
index 0000000..46d2578
--- /dev/null
+++ b/PaheScrapper/Helpers/WebErrorReporter.cs
@@ -0,0 +1,19 @@
+using System.Net;
+using HtmlAgilityPack;
+using Newtonsoft.Json;
+
+namespace PaheScrapper.Helpers
+{
+    /// <summary>
+    /// Writes diagnostic dumps of failed web interactions through <see cref="LogWriter"/>.
+    /// These helpers are meant to be called from error-handling paths, so they avoid
+    /// throwing on missing input.
+    /// </summary>
+    public static class WebErrorReporter
+    {
+        /// <summary>
+        /// Logs a JSON serialization of an HTTP response under the "Http Dump" heading.
+        /// </summary>
+        /// <param name="webResponse">
+        /// The response to dump. May be null (for example when the failed request produced
+        /// no response); JsonConvert serializes a null reference as the literal "null".
+        /// </param>
+        public static void HttpError(WebResponse webResponse)
+        {
+            LogWriter lw = new LogWriter("Http Dump: \n" + JsonConvert.SerializeObject(webResponse));
+        }
+
+        /// <summary>
+        /// Logs the compressed inner HTML of a document under the "Html Dump" heading.
+        /// The payload is produced by <see cref="StringCompressor.CompressString"/> and can be
+        /// restored with <see cref="StringCompressor.DecompressString"/>.
+        /// </summary>
+        /// <param name="htmlDocument">
+        /// The document to dump. May be null when the failure happened before any document
+        /// was obtained; in that case a plain "(no document)" marker is logged instead.
+        /// </param>
+        public static void HtmlError(HtmlDocument htmlDocument)
+        {
+            // Guard against a missing document: dereferencing null here would raise a
+            // NullReferenceException inside the caller's catch block and hide the real error.
+            string html = htmlDocument?.DocumentNode?.InnerHtml;
+            if (html == null)
+            {
+                LogWriter missing = new LogWriter("Html Dump: \n(no document)");
+                return;
+            }
+
+            LogWriter lw = new LogWriter("Html Dump: \n" + StringCompressor.CompressString(html));
+        }
+    }
+}
\ No newline at end of file
diff --git a/PaheScrapper/PaheScrapper.csproj b/PaheScrapper/PaheScrapper.csproj
index 1946d5a..d50093e 100644
--- a/PaheScrapper/PaheScrapper.csproj
+++ b/PaheScrapper/PaheScrapper.csproj
@@ -68,9 +68,18 @@
..\packages\DotNetSeleniumExtras.WaitHelpers.3.11.0\lib\net45\SeleniumExtras.WaitHelpers.dll
+
+ ..\packages\System.Buffers.4.5.1\lib\netstandard1.1\System.Buffers.dll
+
+
+ ..\packages\System.Memory.4.5.4\lib\netstandard1.1\System.Memory.dll
+
+
+ ..\packages\System.Runtime.CompilerServices.Unsafe.4.5.3\lib\netstandard1.0\System.Runtime.CompilerServices.Unsafe.dll
+
@@ -91,8 +100,10 @@
+
+
diff --git a/PaheScrapper/Program.cs b/PaheScrapper/Program.cs
index 8aa7ff7..9651ace 100644
--- a/PaheScrapper/Program.cs
+++ b/PaheScrapper/Program.cs
@@ -84,20 +84,20 @@ static void FullScrape(string command = null)
if (command == "continue")
{
f = File.OpenText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename));
- _manager = ScrapperManager.Deserialize(f.ReadToEnd());
+ _manager = ScrapperManager.Deserialize(StringCompressor.DecompressString(f.ReadToEnd()));
f.Close();
}
else if (command == "resync")
{
f = File.OpenText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename));
- _manager = ScrapperManager.Deserialize(f.ReadToEnd());
+ _manager = ScrapperManager.Deserialize(StringCompressor.DecompressString(f.ReadToEnd()));
_manager.ResetState();
f.Close();
}
else if (command == "loop")
{
f = File.OpenText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename));
- _manager = ScrapperManager.Deserialize(f.ReadToEnd());
+ _manager = ScrapperManager.Deserialize(StringCompressor.DecompressString(f.ReadToEnd()));
f.Close();
isLoop = true;
}
@@ -108,9 +108,9 @@ static void FullScrape(string command = null)
else if (command == "state")
{
f = File.OpenText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename));
- _manager = ScrapperManager.Deserialize(f.ReadToEnd());
+ _manager = ScrapperManager.Deserialize(StringCompressor.DecompressString(f.ReadToEnd()));
f.Close();
- ConsoleHelper.LogCommandHandled($"Current State = {_manager.State.ToString()}");
+ ConsoleHelper.LogCommandHandled($"Current State = {_manager.State}");
_manager = null;
command = null;
goto recommand;
@@ -164,7 +164,7 @@ static void FullScrape(string command = null)
using (var file = File.CreateText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename)))
{
- file.WriteAsync(_manager.Serialize()).Wait();
+ file.WriteAsync(StringCompressor.CompressString(_manager.Serialize())).Wait();
file.Close();
}
@@ -209,7 +209,7 @@ static void FullScrape(string command = null)
using (var file =
File.CreateText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename)))
{
- file.WriteAsync(_manager.Serialize()).Wait();
+ file.WriteAsync(StringCompressor.CompressString(_manager.Serialize())).Wait();
file.Close();
}
@@ -232,7 +232,7 @@ static void FullScrape(string command = null)
ConsoleHelper.LogStorage("Final Save Scraper State");
using (var file = File.CreateText(Path.Combine(_folderPath, Configuration.Default.ManagerStateFilename)))
{
- file.WriteAsync(_manager.Serialize()).Wait();
+ file.WriteAsync(StringCompressor.CompressString(_manager.Serialize())).Wait();
file.Close();
}
diff --git a/PaheScrapper/Properties/Configuration.Designer.cs b/PaheScrapper/Properties/Configuration.Designer.cs
index e39bc42..1bb1de1 100644
--- a/PaheScrapper/Properties/Configuration.Designer.cs
+++ b/PaheScrapper/Properties/Configuration.Designer.cs
@@ -85,7 +85,7 @@ public int WebDriveInstances {
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
- [global::System.Configuration.DefaultSettingValueAttribute("50")]
+ [global::System.Configuration.DefaultSettingValueAttribute("5")]
public int HTMLSaveStateThershold {
get {
return ((int)(this["HTMLSaveStateThershold"]));
@@ -97,7 +97,7 @@ public int HTMLSaveStateThershold {
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
- [global::System.Configuration.DefaultSettingValueAttribute("10")]
+ [global::System.Configuration.DefaultSettingValueAttribute("1")]
public int WebDriveSaveStateThershold {
get {
return ((int)(this["WebDriveSaveStateThershold"]));
diff --git a/PaheScrapper/Properties/Configuration.settings b/PaheScrapper/Properties/Configuration.settings
index 21a0b93..8907769 100644
--- a/PaheScrapper/Properties/Configuration.settings
+++ b/PaheScrapper/Properties/Configuration.settings
@@ -18,10 +18,10 @@
8
- 50
+ 5
- 10
+ 1
True
diff --git a/PaheScrapper/ScrapperConstants.cs b/PaheScrapper/ScrapperConstants.cs
index d1e749b..7b598c2 100644
--- a/PaheScrapper/ScrapperConstants.cs
+++ b/PaheScrapper/ScrapperConstants.cs
@@ -3,7 +3,7 @@
public static class ScrapperConstants
{
public static string WebsiteLanding() => "https://pahe.ph/";
- public static string WebsiteLandingPaging(int page) => page <= 1 ? WebsiteLanding() : $"https://pahe.ph/page/{page}/";
+ public static string WebsiteLandingPaging(int page) => page < 1 ? WebsiteLanding() : $"https://pahe.ph/page/{page + 1}/";
public static int HttpRequestTimeout() => 15 * 1000;
}
}
\ No newline at end of file
diff --git a/PaheScrapper/ScrapperManager.cs b/PaheScrapper/ScrapperManager.cs
index dc959cc..dba3a9d 100644
--- a/PaheScrapper/ScrapperManager.cs
+++ b/PaheScrapper/ScrapperManager.cs
@@ -17,6 +17,7 @@ public class ScrapperManager
private int _maxPage;
private readonly WebsiteContext _websiteContext;
private WebRequestHeader _webRequestHeader;
+ private TimeSpan _totalTimeSpan;
public ScrapperManager()
{
@@ -24,6 +25,7 @@ public ScrapperManager()
_maxPage = _currentPage;
_scrapperState = ScrapperState.Initiate;
_websiteContext = new WebsiteContext();
+ _totalTimeSpan = TimeSpan.Zero;
}
public WebsiteContext Context => _websiteContext;
@@ -71,11 +73,15 @@ void PersistWebDriveState(bool transition)
HtmlDocument htmlDocument = null;
+ var initEntryDateTime = DateTime.MinValue;
+
if (_scrapperState == ScrapperState.Initiate)
{
int retryCount = 0;
int retryLimit = Configuration.Default.HtmlRetryLimit;
+ initEntryDateTime = DateTime.Now;
+
BypassSurcuriRoutine();
retry:
@@ -86,7 +92,9 @@ void PersistWebDriveState(bool transition)
}
catch (Exception e)
{
- if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response."))
+ WebErrorReporter.HtmlError(htmlDocument);
+
+ if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response.") || e.Message.Contains("Sequence contains no matching element"))
{
ConsoleHelper.LogError(e.Message);
@@ -98,7 +106,7 @@ void PersistWebDriveState(bool transition)
{
retryCount++;
- if (e.Message.Contains("Input string was not in a correct format."))
+ if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Sequence contains no matching element"))
BypassSurcuriRoutine();
goto retry;
@@ -155,6 +163,10 @@ void PersistWebDriveState(bool transition)
++_currentPage;
+ var elapsedTime = DateTime.Now.Subtract(initEntryDateTime);
+ _totalTimeSpan += elapsedTime;
+ ConsoleHelper.LogTime(elapsedTime, _totalTimeSpan);
+
if (newMoviesList.Count == 0)
goto summeryFinish;
}
@@ -167,6 +179,7 @@ void PersistWebDriveState(bool transition)
{
_currentPage = i;
ConsoleHelper.LogInfo($"Page: {_currentPage + 1}/{_maxPage}");
+ initEntryDateTime = DateTime.Now;
retry:
try
@@ -187,7 +200,9 @@ void PersistWebDriveState(bool transition)
}
catch (Exception e)
{
- if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response."))
+ WebErrorReporter.HtmlError(htmlDocument);
+
+ if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response.") || e.Message.Contains("Sequence contains no matching element"))
{
ConsoleHelper.LogError(e.Message);
@@ -199,7 +214,7 @@ void PersistWebDriveState(bool transition)
{
retryCount++;
- if (e.Message.Contains("Input string was not in a correct format."))
+ if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Sequence contains no matching element"))
BypassSurcuriRoutine();
goto retry;
@@ -234,6 +249,10 @@ void PersistWebDriveState(bool transition)
throw e;
}
}
+
+ var elapsedTime = DateTime.Now.Subtract(initEntryDateTime);
+ _totalTimeSpan += elapsedTime;
+ ConsoleHelper.LogTime(elapsedTime, _totalTimeSpan);
}
summeryFinish:
@@ -253,6 +272,7 @@ void PersistWebDriveState(bool transition)
int retryLimit = Configuration.Default.HtmlRetryLimit;
_currentPage = i;
ConsoleHelper.LogInfo($"Page: {_currentPage + 1}/{_maxPage}");
+ initEntryDateTime = DateTime.Now;
var movie = _websiteContext.MovieSummeries[i];
@@ -368,7 +388,9 @@ void PersistWebDriveState(bool transition)
}
catch (Exception e)
{
- if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response."))
+ WebErrorReporter.HtmlError(htmlDocument);
+
+ if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Cannot Get Response.") || e.Message.Contains("Sequence contains no matching element"))
{
ConsoleHelper.LogError(e.Message);
@@ -380,7 +402,7 @@ void PersistWebDriveState(bool transition)
{
retryCount++;
- if (e.Message.Contains("Input string was not in a correct format."))
+ if (e.Message.Contains("Input string was not in a correct format.") || e.Message.Contains("Sequence contains no matching element"))
BypassSurcuriRoutine();
goto retry;
@@ -415,6 +437,10 @@ void PersistWebDriveState(bool transition)
throw e;
}
}
+
+ var elapsedTime = DateTime.Now.Subtract(initEntryDateTime);
+ _totalTimeSpan += elapsedTime;
+ ConsoleHelper.LogTime(elapsedTime, _totalTimeSpan);
}
_scrapperState = ScrapperState.Sora;
diff --git a/PaheScrapper/ScrapperMethods.cs b/PaheScrapper/ScrapperMethods.cs
index 566edc6..ba3d14a 100644
--- a/PaheScrapper/ScrapperMethods.cs
+++ b/PaheScrapper/ScrapperMethods.cs
@@ -229,7 +229,7 @@ void ProcessDownloadBox(MovieDetails tmp_details, HtmlNode tmp_downloadNode, str
string qualityNote = null;
foreach (var downloadHtml in downloadHtmls)
- {
+ {
var tmp_downloadHtml = downloadHtml.Replace(" ", "").TrimStart().TrimEnd();
MemoryStream ms = new MemoryStream();
@@ -347,7 +347,10 @@ void ProcessDownloadBox(MovieDetails tmp_details, HtmlNode tmp_downloadNode, str
if (qualityNote != null)
qualityNote = null;
- downloadLinkNodes = downloadLinkNodes.Where(l => l.Contains("{{")).ToArray();
+ downloadLinkNodes = downloadLinkNodes
+ .Where(l => l.Contains("{{"))
+ .Select(l=>l.Substring(0, l.LastIndexOf("}}", StringComparison.Ordinal) + 2))
+ .ToArray();
foreach (var downloadLinkNode in downloadLinkNodes)
{
@@ -355,7 +358,6 @@ void ProcessDownloadBox(MovieDetails tmp_details, HtmlNode tmp_downloadNode, str
}
episode.DownloadQualities.Add(downloadQuality);
- //Console.WriteLine($"Q|={downloadQuality.Quality}\n\tS|={downloadQuality.Size}\n\t\tN|={downloadQuality.Notes}");
}
tmp_details.Episodes.Add(episode);
diff --git a/PaheScrapper/ScrapperWeb.cs b/PaheScrapper/ScrapperWeb.cs
index 21a86fe..576511f 100644
--- a/PaheScrapper/ScrapperWeb.cs
+++ b/PaheScrapper/ScrapperWeb.cs
@@ -39,6 +39,7 @@ public static HtmlDocument GetDownloadHtml(string url, WebRequestHeader header)
}
catch (WebException e)
{
+ WebErrorReporter.HttpError(e.Response);
throw new ScrapperDownloaderException("Cannot Get Response.", e);
}
@@ -93,8 +94,9 @@ public static HtmlDocument PostDownloadHtml(string url, Dictionary
+
+
+
\ No newline at end of file